first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
import pytest
|
||||
|
||||
import sklearn
|
||||
|
||||
|
||||
@pytest.fixture
def print_changed_only_false():
    """Run the requesting test with ``print_changed_only=False``.

    The override is applied through ``sklearn.config_context`` so that the
    configuration that was active before the fixture ran is restored on
    teardown, rather than being forced back to a hard-coded ``True``.
    """
    with sklearn.config_context(print_changed_only=False):
        yield
|
||||
@@ -0,0 +1,16 @@
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._arpack import _init_arpack_v0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
    """`_init_arpack_v0` matches a uniform draw on [-1, 1] for a fixed seed."""
    n_values = 1000
    # Reference vector drawn from an identically seeded random state.
    reference = check_random_state(seed).uniform(-1, 1, size=n_values)
    assert_allclose(_init_arpack_v0(n_values, seed), reference)
|
||||
@@ -0,0 +1,26 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils.arrayfuncs import min_pos
|
||||
|
||||
|
||||
def test_min_pos():
    """min_pos is non-negative and agrees between float64 and float32 input."""
    samples = np.random.RandomState(0).randn(100)

    result_64 = min_pos(samples)
    result_32 = min_pos(samples.astype(np.float32))

    assert_allclose(result_64, result_32)
    assert result_64 >= 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
    """Without any positive entry, min_pos returns the dtype's maximum value.

    See issue #19328.
    """
    all_negative = np.full(100, -1.0).astype(dtype, copy=False)
    largest_representable = np.finfo(dtype).max

    assert min_pos(all_negative) == largest_representable
|
||||
@@ -0,0 +1,292 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
from sklearn.utils.class_weight import compute_class_weight
|
||||
from sklearn.utils.class_weight import compute_sample_weight
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
|
||||
|
||||
def test_compute_class_weight():
    """Demonstrate "balanced" class weights on a small imbalanced target."""
    labels = np.asarray([2, 2, 2, 3, 3, 4])
    present = np.unique(labels)

    weights = compute_class_weight("balanced", classes=present, y=labels)

    # The weighted class counts must add up to the number of samples.
    counts = np.bincount(labels)[2:]
    assert_almost_equal(np.dot(weights, counts), labels.shape[0])

    # The fewer samples a class has, the larger its weight.
    w_first, w_second, w_third = weights
    assert w_first < w_second < w_third
|
||||
|
||||
|
||||
def test_compute_class_weight_not_present():
    """Mismatches between `classes`, `y` and `class_weight` raise ValueError."""
    y = np.asarray([0, 0, 0, 1, 1, 2])

    # `classes` lists a label that never occurs in y.
    all_labels = np.arange(4)
    with pytest.raises(ValueError):
        compute_class_weight("balanced", classes=all_labels, y=y)

    # The error message must format correctly when the missing label is a
    # string: https://github.com/scikit-learn/scikit-learn/issues/8312
    pattern = r"The classes, \[0, 1, 2, 3\], are not in class_weight"
    with pytest.raises(ValueError, match=pattern):
        compute_class_weight({"label_not_present": 1.0}, classes=all_labels, y=y)

    # y holds a label that is absent from `classes`.
    too_few_labels = np.arange(2)
    for weighting in ("balanced", {0: 1.0, 1: 2.0}):
        with pytest.raises(ValueError):
            compute_class_weight(weighting, classes=too_few_labels, y=y)

    # y holds a class for which class_weight has no entry.
    animal_classes = np.asarray(["cat", "dog"])
    animal_y = np.asarray(["dog", "cat", "dog"])
    animal_weights = {"dogs": 3, "cat": 2}
    msg = r"The classes, \['dog'\], are not in class_weight"
    with pytest.raises(ValueError, match=msg):
        compute_class_weight(animal_weights, classes=animal_classes, y=animal_y)
|
||||
|
||||
|
||||
def test_compute_class_weight_dict():
    """Dict weights are returned as given; keys outside `classes` are ignored."""
    labels = np.arange(3)
    y = np.asarray([0, 0, 1, 2])

    # Weights specified by the user are simply handed back.
    given = {0: 1.0, 1: 2.0, 2: 3.0}
    result = compute_class_weight(given, classes=labels, y=y)
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), result)

    # A weight whose key is not one of `classes` has no effect on the result.
    cases_with_extra_key = (
        ({0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}, [1.0, 2.0, 3.0]),
        ({-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}, [4.0, 2.0, 3.0]),
    )
    for weights, expected in cases_with_extra_key:
        result = compute_class_weight(weights, classes=labels, y=y)
        assert_allclose(expected, result)
|
||||
|
||||
|
||||
def test_compute_class_weight_invariance():
    """class_weight="balanced" makes the fit invariant to class imbalance.

    Starting from a two-class blob dataset, three datasets with the same
    number of samples are built: one with the class-1 samples appended twice,
    one with the class-0 samples appended twice, and one with every sample
    duplicated (balanced again). With balanced class weights all three
    logistic regressions must end up with the same coefficients.
    """
    X, y = make_blobs(centers=2, random_state=0)

    def _append_class_twice(label):
        # Original data followed by two extra copies of the `label` samples.
        selected = y == label
        stacked_X = np.vstack([X] + [X[selected]] * 2)
        stacked_y = np.hstack([y] + [y[selected]] * 2)
        return stacked_X, stacked_y

    def _fit_balanced(data, target):
        return LogisticRegression(class_weight="balanced").fit(data, target)

    X_1, y_1 = _append_class_twice(1)
    X_0, y_0 = _append_class_twice(0)
    # Duplicate everything.
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)

    logreg1 = _fit_balanced(X_1, y_1)
    logreg0 = _fit_balanced(X_0, y_0)
    logreg = _fit_balanced(X_, y_)

    # The three models should be identical.
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
|
||||
|
||||
|
||||
def test_compute_class_weight_balanced_negative():
    """ "balanced" weighting works when the class labels are negative."""
    labels = np.array([-2, -1, 0])

    # Every class occurs equally often.
    y = np.asarray([-1, -1, 0, 0, -2, -2])
    weights = compute_class_weight("balanced", classes=labels, y=y)
    assert len(weights) == len(labels)
    assert_array_almost_equal(weights, np.array([1.0, 1.0, 1.0]))

    # Classes occur with different frequencies.
    y = np.asarray([-1, 0, 0, -2, -2, -2])
    weights = compute_class_weight("balanced", classes=labels, y=y)
    assert len(weights) == len(labels)
    # Shift the labels by 2 so that bincount receives non-negative integers.
    counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(weights, counts), y.shape[0])
    assert_array_almost_equal(weights, [2.0 / 3, 2.0, 1.0])
|
||||
|
||||
|
||||
def test_compute_class_weight_balanced_unordered():
    """ "balanced" weights follow the order of `classes` even when unsorted."""
    labels = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    weights = compute_class_weight("balanced", classes=labels, y=y)

    # Pick the per-class counts in the same order as `labels`.
    counts = np.bincount(y)[labels]
    assert_almost_equal(np.dot(weights, counts), y.shape[0])
    assert_array_almost_equal(weights, [2.0, 1.0, 2.0 / 3])
|
||||
|
||||
|
||||
def test_compute_class_weight_default():
    """A class present in y without an explicit weight gets a weight of 1."""
    y = np.asarray([2, 2, 2, 3, 3, 4])
    labels = np.unique(y)
    n_labels = len(labels)

    # No weights specified at all.
    weights = compute_class_weight(None, classes=labels, y=y)
    assert len(weights) == n_labels
    assert_array_almost_equal(weights, np.ones(3))

    # Weights specified for only some of the classes.
    partial_cases = (
        ({2: 1.5}, [1.5, 1.0, 1.0]),
        ({2: 1.5, 4: 0.5}, [1.5, 1.0, 0.5]),
    )
    for partial_weights, expected in partial_cases:
        weights = compute_class_weight(partial_weights, classes=labels, y=y)
        assert len(weights) == n_labels
        assert_array_almost_equal(weights, expected)
|
||||
|
||||
|
||||
def test_compute_sample_weight():
    """Demonstrate compute_sample_weight on 1-D, column and multi-output y."""
    six_ones = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

    # Balanced classes.
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("balanced", y)
    assert_array_almost_equal(weights, six_ones)

    # User-defined weights.
    weights = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(weights, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])

    # Balanced classes given as a column vector.
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = compute_sample_weight("balanced", y)
    assert_array_almost_equal(weights, six_ones)

    # Unbalanced classes.
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = compute_sample_weight("balanced", y)
    expected_balanced = np.array(
        [0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
    )
    assert_array_almost_equal(weights, expected_balanced, decimal=4)

    # `None` weights.
    weights = compute_sample_weight(None, y)
    assert_array_almost_equal(weights, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Multi-output with balanced classes.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight("balanced", y)
    assert_array_almost_equal(weights, six_ones)

    # Multi-output with one user-defined weight dict per output.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(weights, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])

    # Multi-output with unbalanced classes.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    weights = compute_sample_weight("balanced", y)
    assert_array_almost_equal(weights, expected_balanced**2, decimal=3)
|
||||
|
||||
|
||||
def test_compute_sample_weight_with_subsample():
    """Check compute_sample_weight when `indices` selects a subsample."""
    six_ones = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    bootstrap = [0, 1, 1, 2, 2, 3]

    # Balanced classes, every sample selected.
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(weights, six_ones)

    # Same, with y given as a column vector.
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    weights = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(weights, six_ones)

    # A plain subsample.
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("balanced", y, indices=range(4))
    assert_array_almost_equal(weights, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])

    # A bootstrap subsample (indices with repetitions).
    y = np.asarray([1, 1, 1, 2, 2, 2])
    weights = compute_sample_weight("balanced", y, indices=bootstrap)
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
    assert_array_almost_equal(weights, expected_balanced)

    # A bootstrap subsample for multi-output y.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    weights = compute_sample_weight("balanced", y, indices=bootstrap)
    assert_array_almost_equal(weights, expected_balanced**2)

    # A class that is missing from the subsample.
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    weights = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(weights, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])

    # A class that is missing from the subsample, multi-output y.
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    weights = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(weights, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
|
||||
|
||||
|
||||
def test_compute_sample_weight_errors():
    """Invalid arguments to compute_sample_weight raise ValueError."""
    y = np.asarray([1, 1, 1, 2, 2, 2])
    y_ = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])

    # Unknown preset string, with and without a subsample, for both targets.
    for target in (y, y_):
        with pytest.raises(ValueError):
            compute_sample_weight("ni", target)
        with pytest.raises(ValueError):
            compute_sample_weight("ni", target, indices=range(4))

    # Something other than "balanced" combined with a subsample.
    with pytest.raises(ValueError):
        compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))

    # Multi-output y with a weight that is neither a list nor a preset.
    with pytest.raises(ValueError):
        compute_sample_weight({1: 2, 2: 1}, y_)

    # Multi-output y with a list of the wrong length.
    with pytest.raises(ValueError):
        compute_sample_weight([{1: 2, 2: 1}], y_)
|
||||
|
||||
|
||||
def test_compute_sample_weight_more_than_32():
    """Non-regression smoke test for #12146."""
    n_classes = 50  # more than 32 distinct classes
    y = np.arange(n_classes)
    subsample = np.arange(n_classes)  # go through the subsampling code path

    weight = compute_sample_weight("balanced", y, indices=subsample)

    assert_array_almost_equal(weight, np.ones(y.shape[0]))
|
||||
|
||||
|
||||
def test_class_weight_does_not_contains_more_classses():
    """Fitting works when class_weight holds a label that is absent from y.

    Non-regression test for #22413
    """
    weights_with_extra_label = {0: 1, 1: 10, 2: 20}
    features = [[0, 0, 1], [1, 0, 1], [1, 2, 0]]
    target = [0, 0, 1]

    tree = DecisionTreeClassifier(class_weight=weights_with_extra_label)

    # Must not raise.
    tree.fit(features, target)
|
||||
|
||||
|
||||
def test_compute_sample_weight_sparse():
    """Sample weights can be computed when `y` is a sparse matrix."""
    dense_row = np.asarray([0, 1, 1])
    y = sparse.csc_matrix(dense_row).T

    weights = compute_sample_weight("balanced", y)

    assert_allclose(weights, [1.5, 0.75, 0.75])
|
||||
@@ -0,0 +1,231 @@
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils._cython_blas import _dot_memview
|
||||
from sklearn.utils._cython_blas import _asum_memview
|
||||
from sklearn.utils._cython_blas import _axpy_memview
|
||||
from sklearn.utils._cython_blas import _nrm2_memview
|
||||
from sklearn.utils._cython_blas import _copy_memview
|
||||
from sklearn.utils._cython_blas import _scal_memview
|
||||
from sklearn.utils._cython_blas import _rotg_memview
|
||||
from sklearn.utils._cython_blas import _rot_memview
|
||||
from sklearn.utils._cython_blas import _gemv_memview
|
||||
from sklearn.utils._cython_blas import _ger_memview
|
||||
from sklearn.utils._cython_blas import _gemm_memview
|
||||
from sklearn.utils._cython_blas import RowMajor, ColMajor
|
||||
from sklearn.utils._cython_blas import Trans, NoTrans
|
||||
|
||||
|
||||
def _numpy_to_cython(dtype):
    """Return the Cython type matching a NumPy floating dtype.

    ``np.float32`` gives ``cython.float`` and ``np.float64`` gives
    ``cython.double``; any other value yields ``None``. Cython is obtained
    through ``pytest.importorskip``.
    """
    cython = pytest.importorskip("cython")
    if dtype == np.float32:
        return cython.float
    if dtype == np.float64:
        return cython.double
    return None
|
||||
|
||||
|
||||
# Relative tolerance used when comparing results, keyed by dtype.
RTOL = {
    np.float32: 1e-6,
    np.float64: 1e-12,
}
|
||||
# NumPy memory-order flag for each BLAS layout constant.
ORDER = {
    RowMajor: "C",
    ColMajor: "F",
}
|
||||
|
||||
|
||||
def _no_op(x):
|
||||
return x
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
    """`_dot_memview` agrees with `ndarray.dot` on two random vectors."""
    dot = _dot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x, y = (rng.random_sample(10).astype(dtype, copy=False) for _ in range(2))

    reference = x.dot(y)
    computed = dot(x, y)

    assert_allclose(computed, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
    """`_asum_memview` agrees with the sum of absolute values."""
    asum = _asum_memview[_numpy_to_cython(dtype)]

    vector = np.random.RandomState(0).random_sample(10).astype(dtype, copy=False)

    reference = np.abs(vector).sum()
    computed = asum(vector)

    assert_allclose(computed, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
    """`_axpy_memview` leaves ``alpha * x + y`` in ``y``."""
    axpy = _axpy_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x, y = (rng.random_sample(10).astype(dtype, copy=False) for _ in range(2))
    alpha = 2.5

    # Compute the reference before the call: the result is read back from y.
    reference = alpha * x + y
    axpy(alpha, x, y)

    assert_allclose(y, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
    """`_nrm2_memview` agrees with `np.linalg.norm` on a random vector."""
    nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]

    vector = np.random.RandomState(0).random_sample(10).astype(dtype, copy=False)

    reference = np.linalg.norm(vector)
    computed = nrm2(vector)

    assert_allclose(computed, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
    """`_copy_memview` writes the content of ``x`` into ``y``."""
    copy = _copy_memview[_numpy_to_cython(dtype)]

    source = np.random.RandomState(0).random_sample(10).astype(dtype, copy=False)
    destination = np.empty_like(source)

    reference = source.copy()
    copy(source, destination)

    assert_allclose(destination, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
    """`_scal_memview` leaves ``alpha * x`` in ``x``."""
    scal = _scal_memview[_numpy_to_cython(dtype)]

    vector = np.random.RandomState(0).random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    # Compute the reference before the call: the result is read back from x.
    reference = alpha * vector
    scal(alpha, vector)

    assert_allclose(vector, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
    """`_rotg_memview` agrees with a pure-Python reference computation."""
    rotg = _rotg_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    a = dtype(rng.randn())
    b = dtype(rng.randn())
    c, s = 0.0, 0.0

    def expected_rotg(a, b):
        # Reference values, returned in the order (r, z, c, s).
        if a == 0 and b == 0:
            return 0, 0, 1, 0

        roe = a if abs(a) > abs(b) else b
        sign = 1 if roe >= 0 else -1
        r = np.sqrt(a**2 + b**2) * sign
        c, s = a / r, b / r

        if roe == a:
            z = s
        elif c == 0:
            z = 1
        else:
            z = 1 / c
        return r, z, c, s

    reference = expected_rotg(a, b)
    computed = rotg(a, b, c, s)

    assert_allclose(computed, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
    """`_rot_memview` rotates ``x`` and ``y`` in place by ``(c, s)``."""
    rot = _rot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x, y = (rng.random_sample(10).astype(dtype, copy=False) for _ in range(2))
    c = dtype(rng.randn())
    s = dtype(rng.randn())

    # Compute both references before the call: x and y are overwritten.
    rotated_x = c * x + s * y
    rotated_y = c * y - s * x

    rot(x, y, c, s)

    assert_allclose(x, rotated_x)
    assert_allclose(y, rotated_y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
    """`_gemv_memview` leaves ``alpha * op(A) @ x + beta * y`` in ``y``."""
    gemv = _gemv_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    # Applying opA here and again below cancels out, so that op(A) is (20, 10).
    raw_A = rng.random_sample((20, 10)).astype(dtype, copy=False)
    A = np.asarray(opA(raw_A), order=ORDER[order])
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    alpha, beta = 2.5, -0.5

    # Compute the reference before the call: the result is read back from y.
    reference = alpha * opA(A).dot(x) + beta * y
    gemv(transA, alpha, A, x, beta, y)

    assert_allclose(y, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
    """`_ger_memview` leaves ``alpha * outer(x, y) + A`` in ``A``."""
    ger = _ger_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    raw_A = rng.random_sample((10, 20)).astype(dtype, copy=False)
    A = np.asarray(raw_A, order=ORDER[order])
    alpha = 2.5

    # Compute the reference before the call: the result is read back from A.
    reference = alpha * np.outer(x, y) + A
    ger(alpha, x, y, A)

    assert_allclose(A, reference, rtol=RTOL[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "opB, transB", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize(
    "opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
    """`_gemm_memview` leaves ``alpha * op(A) @ op(B) + beta * C`` in ``C``."""
    gemm = _gemm_memview[_numpy_to_cython(dtype)]
    layout = ORDER[order]

    rng = np.random.RandomState(0)
    # The operands are drawn in the order A, B, C.
    raw_A = rng.random_sample((30, 10)).astype(dtype, copy=False)
    A = np.asarray(opA(raw_A), order=layout)
    raw_B = rng.random_sample((10, 20)).astype(dtype, copy=False)
    B = np.asarray(opB(raw_B), order=layout)
    raw_C = rng.random_sample((30, 20)).astype(dtype, copy=False)
    C = np.asarray(raw_C, order=layout)
    alpha, beta = 2.5, -0.5

    # Compute the reference before the call: the result is read back from C.
    reference = alpha * opA(A).dot(opB(B)) + beta * C
    gemm(transA, transB, alpha, A, B, beta, C)

    assert_allclose(C, reference, rtol=RTOL[dtype])
|
||||
@@ -0,0 +1,20 @@
|
||||
import pathlib
|
||||
import pytest
|
||||
import sklearn
|
||||
|
||||
|
||||
def test_files_generated_by_templates_are_git_ignored():
    """Every file generated from a ``*.tp`` template is listed in .gitignore."""
    package_dir = pathlib.Path(sklearn.__file__).parent
    repo_root = package_dir.parent

    gitignore_file = repo_root / ".gitignore"
    if not gitignore_file.exists():
        pytest.skip("Tests are not run from the source folder")

    ignored_files = [
        pathlib.Path(entry) for entry in gitignore_file.read_text().split("\n")
    ]

    for template in package_dir.glob("**/*.tp"):
        # "path/to/template.p??.tp" -> "path/to/template.p??", relative to the
        # directory that holds the .gitignore file.
        generated = template.relative_to(repo_root).with_suffix("")
        assert generated in ignored_files
|
||||
@@ -0,0 +1,76 @@
|
||||
# Authors: Raghav RV <rvraghav93@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import pickle
|
||||
|
||||
from sklearn.utils.deprecation import _is_deprecated
|
||||
from sklearn.utils.deprecation import deprecated
|
||||
import pytest
|
||||
|
||||
|
||||
# Class deprecated as a whole, with "qwerty" as the extra message.
@deprecated("qwerty")
class MockClass1:
    ...
|
||||
|
||||
|
||||
# Regular class exposing a deprecated method and a deprecated property.
class MockClass2:
    @deprecated("mockclass2_method")
    def method(self):
        return None

    # `deprecated` is applied on top of `property`.
    @deprecated("n_features_ is deprecated")  # type: ignore
    @property
    def n_features_(self):
        """Number of input features."""
        return 10
|
||||
|
||||
|
||||
# Class whose constructor is deprecated without an extra message.
class MockClass3:
    @deprecated()
    def __init__(self):
        return None
|
||||
|
||||
|
||||
# Plain class with nothing deprecated.
class MockClass4:
    ...
|
||||
|
||||
|
||||
# Deprecated function with a fixed return value.
@deprecated()
def mock_function():
    value = 10
    return value
|
||||
|
||||
|
||||
def test_deprecated():
    """Using a deprecated class, method or function emits a FutureWarning."""
    warning_cases = (
        (MockClass1, "qwerty"),
        (lambda: MockClass2().method(), "mockclass2_method"),
        (MockClass3, "deprecated"),
    )
    for trigger, pattern in warning_cases:
        with pytest.warns(FutureWarning, match=pattern):
            trigger()

    # The deprecated function still returns its value.
    with pytest.warns(FutureWarning, match="deprecated"):
        val = mock_function()
    assert val == 10
|
||||
|
||||
|
||||
def test_is_deprecated():
    """`_is_deprecated` detects callables wrapped by `deprecated`.

    NOTE it works only for class methods and functions.
    """
    assert _is_deprecated(MockClass1.__init__)
    assert _is_deprecated(MockClass2().method)
    assert _is_deprecated(MockClass3.__init__)

    # A constructor that was never wrapped must not be flagged.
    is_flagged = _is_deprecated(MockClass4.__init__)
    assert not is_flagged

    assert _is_deprecated(mock_function)
|
||||
|
||||
|
||||
def test_pickle():
    """A deprecated function survives a pickle round trip without error."""
    payload = pickle.dumps(mock_function)
    pickle.loads(payload)
|
||||
|
||||
|
||||
def test_deprecated_property_docstring_exists():
    """Deprecated property contains the original docstring."""
    expected_doc = (
        "DEPRECATED: n_features_ is deprecated\n\n    Number of input features."
    )
    actual_doc = MockClass2.n_features_.__doc__
    assert expected_doc == actual_doc
|
||||
@@ -0,0 +1,277 @@
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.utils._encode import _unique
|
||||
from sklearn.utils._encode import _encode
|
||||
from sklearn.utils._encode import _check_unknown
|
||||
from sklearn.utils._encode import _get_counts
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, expected",
    [
        (np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
        (
            np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
            np.array([1, 2, np.nan], dtype="float32"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
        ),
        (
            np.array(["b", "a", None, "a", None], dtype=object),
            np.array(["a", "b", None], dtype=object),
        ),
        (np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
    ],
    ids=["int64", "float32-nan", "object", "object-None", "str"],
)
def test_encode_util(values, expected):
    """Check `_unique` with every combination of flags, and `_encode`."""
    expected_codes = np.array([1, 0, 2, 0, 2])
    expected_counts = np.array([2, 1, 2])

    # Unique values only.
    uniques = _unique(values)
    assert_array_equal(uniques, expected)

    # Unique values with the inverse mapping.
    found, codes = _unique(values, return_inverse=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Encoding against the uniques computed above.
    codes = _encode(values, uniques=uniques)
    assert_array_equal(codes, expected_codes)

    # Unique values with their counts.
    found, counts = _unique(values, return_counts=True)
    assert_array_equal(found, expected)
    assert_array_equal(counts, expected_counts)

    # Unique values with both the inverse mapping and the counts.
    found, codes, counts = _unique(values, return_inverse=True, return_counts=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)
    assert_array_equal(counts, expected_counts)
|
||||
|
||||
|
||||
def test_encode_with_check_unknown():
    """Check the `check_unknown` parameter of `_encode`."""
    unseen_msg = "y contains previously unseen labels"

    known_ints = np.array([1, 2, 3])
    ints_with_unknown = np.array([1, 2, 3, 4])

    # check_unknown=True: an unknown value raises.
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(ints_with_unknown, uniques=known_ints, check_unknown=True)

    # check_unknown=False: no error is raised.
    _encode(ints_with_unknown, uniques=known_ints, check_unknown=False)

    # For object dtype the parameter is ignored and the error is still raised.
    known_strs = np.array(["a", "b", "c"], dtype=object)
    strs_with_unknown = np.array(["a", "b", "c", "d"], dtype=object)
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(strs_with_unknown, uniques=known_strs, check_unknown=False)
|
||||
|
||||
|
||||
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    """Check `_check_unknown` both without and with `return_mask=True`."""
    # Without the mask only the unknown values are returned.
    unknown = _check_unknown(values, uniques)
    assert_array_equal(unknown, expected_diff)

    # With the mask, the same unknown values come with a validity mask.
    unknown, mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(unknown, expected_diff)
    assert_array_equal(mask, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        (np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
        (np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
        (np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        (
            np.array(["a", "b", "c", "d"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"], dtype=object),
            np.array(["a", "c", "b"], dtype=object),
            np.array(["d"], dtype=object),
            [False, True, True, True],
        ),
        (
            np.array(["a", "b", "c", "d"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"]),
            np.array(["a", "c", "b"]),
            np.array(["d"]),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    """Check `_check_unknown` on numeric, object and string arrays."""
    _assert_check_unknown(
        values=values,
        uniques=uniques,
        expected_diff=expected_diff,
        expected_mask=expected_mask,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    """Check `_check_unknown` with missing values in object-dtype arrays."""

    def _as_uniques(items):
        # Build the object array, optionally passing it through pickle.
        arr = np.array(items, dtype=object)
        if pickle_uniques:
            arr = pickle.loads(pickle.dumps(arr))
        return arr

    # The missing value is present in both values and uniques.
    values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    uniques = _as_uniques(["c", "a", "b", missing_value])
    _assert_check_unknown(
        values, uniques, ["d"], [False, True, True, True, True]
    )

    # The missing value is present in values only.
    values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    uniques = _as_uniques(["c", "a", "b"])
    _assert_check_unknown(
        values, uniques, ["d", missing_value], [False, True, True, True, False]
    )

    # The missing value is the only unknown entry.
    values = np.array(["a", missing_value], dtype=object)
    uniques = _as_uniques(["a", "b", "z"])
    _assert_check_unknown(values, uniques, [missing_value], [True, False])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    """Check `_unique` and `_encode` on object arrays with one missing value."""
    values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
    expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)

    found = _unique(values)

    if missing_value is not None:
        # nan-like marker: compare the leading entries, then check the
        # trailing entry with isnan
        assert_array_equal(found[:-1], expected_uniques[:-1])
        assert np.isnan(found[-1])
    else:
        assert_array_equal(found, expected_uniques)

    if pickle_uniques:
        found = pickle.loads(pickle.dumps(found))

    codes = _encode(values, uniques=found)
    assert_array_equal(codes, np.array([0, 2, 2, 3, 1]))
|
||||
|
||||
|
||||
def test_unique_util_missing_values_numeric():
    """Check `_unique` and `_encode` on a float array containing nan."""
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    want_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    want_inverse = np.array([1, 0, 3, 2, 1, 3])

    # uniques only
    assert_array_equal(_unique(values), want_uniques)

    # uniques together with the inverse mapping
    uniques, inverse = _unique(values, return_inverse=True)
    assert_array_equal(uniques, want_uniques)
    assert_array_equal(inverse, want_inverse)

    # encoding against the uniques must reproduce the inverse mapping
    assert_array_equal(_encode(values, uniques=uniques), want_inverse)
|
||||
|
||||
|
||||
def test_unique_util_with_all_missing_values():
    """Check `_unique` when None, np.nan and float("nan") appear together."""
    mixed = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)

    found = _unique(mixed)
    # everything before the final entry compares exactly ...
    assert_array_equal(found[:-1], ["a", "c", None])
    # ... and the final entry is nan
    assert np.isnan(found[-1])

    _, inverse = _unique(mixed, return_inverse=True)
    assert_array_equal(inverse, [3, 0, 1, 1, 2, 3, 2])
|
||||
|
||||
|
||||
def test_check_unknown_with_both_missing_values():
    """Check `_check_unknown` when both None and nan are unknown values."""
    values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
    known = np.array(["a", "c"], dtype=object)

    # without the mask: None is reported first, then nan
    unknown = _check_unknown(values, known_values=known)
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    # with the mask: same diff, and every missing entry is flagged invalid
    unknown, mask = _check_unknown(values, known_values=known, return_mask=True)
    assert unknown[0] is None
    assert np.isnan(unknown[1])
    assert_array_equal(mask, [False, True, True, True, False, False, False])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, uniques, expected_counts",
    [
        # numeric values, every unique present
        (np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
        # numeric values, one unique never observed
        (
            np.array([1] * 10 + [2] * 4 + [3] * 15),
            np.array([1, 2, 3, 5]),
            [10, 4, 15, 0],
        ),
        # numeric values with nan
        (
            np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
            np.array([2, 3, np.nan]),
            [4, 15, 10],
        ),
        # object values, uniques given in sorted order
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c"],
            [16, 4, 20],
        ),
        # object values, uniques given in another order
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", "b", "a"],
            [20, 4, 16],
        ),
        # object values with nan
        (
            np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", np.nan, "a"],
            [20, 4, 16],
        ),
        # object values, one unique never observed
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c", "e"],
            [16, 4, 20, 0],
        ),
    ],
)
def test_get_counts(values, uniques, expected_counts):
    """Check that `_get_counts` returns one count per entry of `uniques`."""
    assert_array_equal(_get_counts(values, uniques), expected_counts)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,311 @@
|
||||
from contextlib import closing
|
||||
import html
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import FeatureUnion
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.ensemble import VotingClassifier
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.cluster import Birch
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.multiclass import OneVsOneClassifier
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
from sklearn.gaussian_process.kernels import ExpSineSquared
|
||||
from sklearn.kernel_ridge import KernelRidge
|
||||
|
||||
from sklearn.model_selection import RandomizedSearchCV
|
||||
from sklearn.utils._estimator_html_repr import _write_label_html
|
||||
from sklearn.utils._estimator_html_repr import _get_visual_block
|
||||
from sklearn.utils._estimator_html_repr import estimator_html_repr
|
||||
|
||||
|
||||
@pytest.mark.parametrize("checked", [True, False])
def test_write_label_html(checked):
    """Check the label markup written by `_write_label_html`."""
    label_name = "LogisticRegression"
    details = "hello-world"

    with closing(StringIO()) as buffer:
        _write_label_html(buffer, label_name, details, checked=checked)
        rendered = buffer.getvalue()

        assert "LogisticRegression</label>" in rendered
        assert rendered.startswith('<div class="sk-label-container">')
        assert "<pre>hello-world</pre>" in rendered
        # the attribute is only looked for when the box was requested checked
        if checked:
            assert "checked>" in rendered
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", ["passthrough", "drop", None])
def test_get_visual_block_single_str_none(est):
    """Check the visual block built for string or None placeholders."""
    block = _get_visual_block(est)
    label = str(est)

    assert block.kind == "single"
    assert block.estimators == est
    # both the name and its details are the string form of the placeholder
    assert block.names == label
    assert block.name_details == label
|
||||
|
||||
|
||||
def test_get_visual_block_single_estimator():
    """Check the visual block built for a plain estimator instance."""
    log_reg = LogisticRegression(C=10.0)
    block = _get_visual_block(log_reg)

    assert block.kind == "single"
    assert block.estimators == log_reg
    assert block.names == type(log_reg).__name__
    assert block.name_details == str(log_reg)
|
||||
|
||||
|
||||
def test_get_visual_block_pipeline():
    """Check the visual block built for a Pipeline with placeholder steps."""
    steps = [
        ("imputer", SimpleImputer()),
        ("do_nothing", "passthrough"),
        ("do_nothing_more", None),
        ("classifier", LogisticRegression()),
    ]
    pipe = Pipeline(steps)

    block = _get_visual_block(pipe)

    assert block.kind == "serial"
    assert block.estimators == tuple(est for _, est in pipe.steps)
    # a None step is labelled "passthrough", like the explicit string
    expected_names = [
        "imputer: SimpleImputer",
        "do_nothing: passthrough",
        "do_nothing_more: passthrough",
        "classifier: LogisticRegression",
    ]
    assert block.names == expected_names
    assert block.name_details == [str(est) for _, est in pipe.steps]
|
||||
|
||||
|
||||
def test_get_visual_block_feature_union():
    """Check the visual block built for a FeatureUnion."""
    union = FeatureUnion([("pca", PCA()), ("svd", TruncatedSVD())])
    block = _get_visual_block(union)

    assert block.kind == "parallel"
    assert block.names == ("pca", "svd")
    expected_estimators = tuple(est for _, est in union.transformer_list)
    assert block.estimators == expected_estimators
    assert block.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_voting():
    """Check the visual block built for a VotingClassifier."""
    members = [("log_reg", LogisticRegression()), ("mlp", MLPClassifier())]
    voter = VotingClassifier(members)

    block = _get_visual_block(voter)

    assert block.kind == "parallel"
    assert block.estimators == tuple(est for _, est in voter.estimators)
    assert block.names == ("log_reg", "mlp")
    assert block.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_column_transformer():
    """Check the visual block built for a ColumnTransformer.

    The block is expected to be parallel, to expose the transformers and
    their names, and to use the selected columns as the name details.
    """
    # Both transformers are estimator *instances*: the `TruncatedSVD` class
    # itself is not a valid transformer entry.
    ct = ColumnTransformer(
        [("pca", PCA(), ["num1", "num2"]), ("svd", TruncatedSVD(), [0, 3])]
    )
    est_html_info = _get_visual_block(ct)

    assert est_html_info.kind == "parallel"
    assert est_html_info.estimators == tuple(trans[1] for trans in ct.transformers)
    assert est_html_info.names == ("pca", "svd")
    assert est_html_info.name_details == (["num1", "num2"], [0, 3])
|
||||
|
||||
|
||||
def test_estimator_html_repr_pipeline():
    """Check the HTML rendering of a deeply nested pipeline.

    The pipeline chains a ColumnTransformer (holding two sub-pipelines), a
    FeatureUnion (holding a sub-pipeline) and a VotingClassifier. The test
    looks for the text representation of each level in the generated HTML.
    """
    numeric_steps = [
        ("pass", "passthrough"),
        ("imputer", SimpleImputer(strategy="median")),
    ]
    num_trans = Pipeline(steps=numeric_steps)

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="constant", missing_values="empty")),
        ("one-hot", OneHotEncoder(drop="first")),
    ]
    cat_trans = Pipeline(steps=categorical_steps)

    preprocess = ColumnTransformer(
        [
            ("num", num_trans, ["a", "b", "c", "d", "e"]),
            ("cat", cat_trans, [0, 1, 2, 3]),
        ]
    )

    tsvd_pipe = Pipeline(
        [
            ("first", TruncatedSVD(n_components=3)),
            ("select", SelectPercentile()),
        ]
    )
    feat_u = FeatureUnion([("pca", PCA(n_components=1)), ("tsvd", tsvd_pipe)])

    clf = VotingClassifier(
        [
            ("lr", LogisticRegression(solver="lbfgs", random_state=1)),
            ("mlp", MLPClassifier(alpha=0.001)),
        ]
    )

    pipe = Pipeline(
        [("preprocessor", preprocess), ("feat_u", feat_u), ("classifier", clf)]
    )
    html_output = estimator_html_repr(pipe)

    # top level estimators show estimator with changes
    assert html.escape(str(pipe)) in html_output
    for _, step in pipe.steps:
        toggled = f'<div class="sk-toggleable__content"><pre>{html.escape(str(step))}'
        assert toggled in html_output

    # low level estimators do not show changes
    with config_context(print_changed_only=True):
        for step_name in ("pass", "imputer"):
            assert html.escape(str(num_trans[step_name])) in html_output
        assert "passthrough</label>" in html_output

        # column selections of the column transformer
        for _, _, cols in preprocess.transformers:
            assert f"<pre>{html.escape(str(cols))}</pre>" in html_output

        # feature union
        for name, _ in feat_u.transformer_list:
            assert f"<label>{html.escape(name)}</label>" in html_output

        (_, pca), (_, tsvd) = feat_u.transformer_list
        assert f"<pre>{html.escape(str(pca))}</pre>" in html_output
        for inner_name in ("first", "select"):
            inner = tsvd[inner_name]
            assert f"<pre>{html.escape(str(inner))}</pre>" in html_output

        # voting classifier
        for name, est in clf.estimators:
            assert f"<label>{html.escape(name)}</label>" in html_output
            assert f"<pre>{html.escape(str(est))}</pre>" in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
def test_stacking_classsifer(final_estimator):
    """Check the HTML rendering of a StackingClassifier."""
    base_learners = [
        ("mlp", MLPClassifier(alpha=0.001)),
        ("tree", DecisionTreeClassifier()),
    ]
    stack = StackingClassifier(
        estimators=base_learners, final_estimator=final_estimator
    )

    html_output = estimator_html_repr(stack)

    assert html.escape(str(stack)) in html_output
    if final_estimator is not None:
        assert type(final_estimator).__name__ in html_output
    else:
        # If final_estimator's default changes from LogisticRegression
        # this should be updated
        assert "LogisticRegression(" in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
def test_stacking_regressor(final_estimator):
    """Check the HTML rendering of a StackingRegressor."""
    stack = StackingRegressor(
        estimators=[("svr", LinearSVR())], final_estimator=final_estimator
    )
    html_output = estimator_html_repr(stack)

    first_name = stack.estimators[0][0]
    assert html.escape(str(first_name)) in html_output
    assert "LinearSVR</label>" in html_output

    if final_estimator is not None:
        assert html.escape(type(final_estimator).__name__) in html_output
    else:
        # without an explicit final estimator a RidgeCV label is expected
        assert "RidgeCV</label>" in html_output
|
||||
|
||||
|
||||
def test_birch_duck_typing_meta():
    """Check the HTML rendering of Birch wrapping another clusterer."""
    inner = AgglomerativeClustering(n_clusters=3)
    birch = Birch(n_clusters=inner)
    html_output = estimator_html_repr(birch)

    # inner estimators do not show changes
    with config_context(print_changed_only=True):
        inner_repr = html.escape(str(birch.n_clusters))
        assert f"<pre>{inner_repr}" in html_output
        assert "AgglomerativeClustering</label>" in html_output

    # outer estimator contains all changes
    outer_repr = html.escape(str(birch))
    assert f"<pre>{outer_repr}" in html_output
|
||||
|
||||
|
||||
def test_ovo_classifier_duck_typing_meta():
    """Check the HTML rendering of OneVsOneClassifier wrapping LinearSVC."""
    ovo = OneVsOneClassifier(LinearSVC(penalty="l1"))
    html_output = estimator_html_repr(ovo)

    # inner estimators do not show changes
    with config_context(print_changed_only=True):
        inner_repr = html.escape(str(ovo.estimator))
        assert f"<pre>{inner_repr}" in html_output
        assert "LinearSVC</label>" in html_output

    # outer estimator
    outer_repr = html.escape(str(ovo))
    assert f"<pre>{outer_repr}" in html_output
|
||||
|
||||
|
||||
def test_duck_typing_nested_estimator():
    """Check the HTML rendering of a random search wrapping KernelRidge."""
    base = KernelRidge(kernel=ExpSineSquared())
    search = RandomizedSearchCV(base, param_distributions={"alpha": [1, 2]})

    html_output = estimator_html_repr(search)

    # the wrapped estimator is labelled with its parameter name
    assert "estimator: KernelRidge</label>" in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("print_changed_only", [True, False])
def test_one_estimator_print_change_only(print_changed_only):
    """Check that the HTML output contains the estimator's text repr.

    Both the text repr and the HTML are produced under the same
    `print_changed_only` setting.
    """
    estimator = PCA(n_components=10)

    with config_context(print_changed_only=print_changed_only):
        text_repr = html.escape(str(estimator))
        assert text_repr in estimator_html_repr(estimator)
|
||||
|
||||
|
||||
def test_fallback_exists():
    """Check that repr fallback is in the HTML."""
    estimator = PCA(n_components=10)
    html_output = estimator_html_repr(estimator)

    fallback = (
        '<div class="sk-text-repr-fallback"><pre>' + html.escape(str(estimator))
    )
    assert fallback in html_output
|
||||
|
||||
|
||||
def test_show_arrow_pipeline():
    """Show arrow in pipeline for top level in pipeline"""
    steps = [("scale", StandardScaler()), ("log_Reg", LogisticRegression())]
    html_output = estimator_html_repr(Pipeline(steps))

    arrow_label = 'class="sk-toggleable__label sk-toggleable__label-arrow">Pipeline'
    assert arrow_label in html_output
|
||||
@@ -0,0 +1,982 @@
|
||||
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
|
||||
# Mathieu Blondel <mathieu@mblondel.org>
|
||||
# Denis Engemann <denis-alexander.engemann@inria.fr>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy import linalg
|
||||
from scipy import stats
|
||||
from scipy.sparse.linalg import eigsh
|
||||
from scipy.special import expit
|
||||
|
||||
import pytest
|
||||
from sklearn.utils import gen_batches
|
||||
from sklearn.utils._arpack import _init_arpack_v0
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils._testing import assert_allclose_dense_sparse
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import skip_if_32bit
|
||||
|
||||
from sklearn.utils.extmath import density, _safe_accumulator_op
|
||||
from sklearn.utils.extmath import randomized_svd, _randomized_eigsh
|
||||
from sklearn.utils.extmath import row_norms
|
||||
from sklearn.utils.extmath import weighted_mode
|
||||
from sklearn.utils.extmath import cartesian
|
||||
from sklearn.utils.extmath import log_logistic
|
||||
from sklearn.utils.extmath import svd_flip
|
||||
from sklearn.utils.extmath import _incremental_mean_and_var
|
||||
from sklearn.utils.extmath import _deterministic_vector_sign_flip
|
||||
from sklearn.utils.extmath import softmax
|
||||
from sklearn.utils.extmath import stable_cumsum
|
||||
from sklearn.utils.extmath import safe_sparse_dot
|
||||
from sklearn.datasets import make_low_rank_matrix, make_sparse_spd_matrix
|
||||
|
||||
|
||||
def test_density():
    """Check that `density` agrees between a dense array and sparse copies."""
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 5))
    # force two entries to zero
    X[1, 2] = 0
    X[5, 3] = 0

    sparse_formats = (
        sparse.csr_matrix,
        sparse.csc_matrix,
        sparse.coo_matrix,
        sparse.lil_matrix,
    )
    sparse_copies = [to_sparse(X) for to_sparse in sparse_formats]

    for X_sparse in sparse_copies:
        assert density(X_sparse) == density(X)
|
||||
|
||||
|
||||
def test_uniform_weights():
    """Check `weighted_mode` against `stats.mode` when all weights are one."""
    rng = np.random.RandomState(0)
    data = rng.randint(10, size=(10, 5))
    unit_weights = np.ones(data.shape)

    for axis in (None, 0, 1):
        ref_mode, ref_score = stats.mode(data, axis)
        got_mode, got_score = weighted_mode(data, unit_weights, axis=axis)

        assert_array_equal(ref_mode, got_mode)
        assert_array_equal(ref_score, got_score)
|
||||
|
||||
|
||||
def test_random_weights():
    """Check `weighted_mode` with random weights and a planted mode.

    The first five columns of every row hold the value 6 and get their
    weight increased by one, so each row should have a weighted mode of 6
    with a score equal to the summed weight of those columns.
    """
    mode_result = 6

    rng = np.random.RandomState(0)
    # every other entry is drawn strictly below the planted value
    data = rng.randint(mode_result, size=(100, 10))
    weights = rng.random_sample(data.shape)

    planted = slice(None, 5)
    data[:, planted] = mode_result
    weights[:, planted] += 1

    mode, score = weighted_mode(data, weights, axis=1)

    assert_array_equal(mode, mode_result)
    assert_array_almost_equal(score.ravel(), weights[:, planted].sum(1))
|
||||
|
||||
|
||||
def check_randomized_svd_low_rank(dtype):
    """Check that `randomized_svd` is consistent with `linalg.svd`.

    A noise-free low rank matrix is cast to `dtype`, decomposed exactly and
    then approximately with several power iteration normalizers. Output
    dtypes, shapes, singular values and the reconstructed product are
    compared.
    """
    n_samples, n_features = 100, 500
    rank = 5
    k = 10
    # float32 input is compared with a looser precision
    if dtype == np.float32:
        decimal = 5
    else:
        decimal = 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.0,
        random_state=0,
    ).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # exact (slow) decomposition, converted to the dtype under test
    U, s, Vt = linalg.svd(X, full_matrices=False)
    U, s, Vt = (factor.astype(dtype, copy=False) for factor in (U, s, Vt))

    float_input = dtype.kind == "f"

    for normalizer in ["auto", "LU", "QR"]:  # 'none' would not be stable
        # fast approximate decomposition
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )

        # Float input keeps its bit size (f32 is not upcast to f64), any
        # other input is expected to come back as float64.
        expected_dtype = dtype if float_input else np.float64
        for factor in (Ua, sa, Va):
            assert factor.dtype == expected_dtype

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k,)
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        exact_product = np.dot(U[:, :k], Vt[:k, :])
        approx_product = np.dot(Ua, Va)
        assert_almost_equal(exact_product, approx_product, decimal=decimal)

        # check the sparse matrix representation
        # NOTE: X is rebound here, so the following loop iterations feed the
        # CSR matrix to the first randomized_svd call as well.
        X = sparse.csr_matrix(X)

        # fast approximate decomposition on the sparse input
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )
        for factor in (Ua, sa, Va):
            if float_input:
                assert factor.dtype == dtype
            else:
                assert factor.dtype.kind == "f"

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", (np.int32, np.int64, np.float32, np.float64))
def test_randomized_svd_low_rank_all_dtypes(dtype):
    """Run `check_randomized_svd_low_rank` for integer and float dtypes."""
    check_randomized_svd_low_rank(dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", (np.int32, np.int64, np.float32, np.float64))
def test_randomized_eigsh(dtype):
    """Test that `_randomized_eigsh` returns the appropriate components"""

    rng = np.random.RandomState(42)
    spectrum = np.array([1.0, -2.0, 0.0, 3.0], dtype=dtype)
    X = np.diag(spectrum)
    # random rotation that preserves the eigenvalues of X
    rotation, _ = np.linalg.qr(rng.normal(size=X.shape))
    X = rotation @ X @ rotation.T

    # with 'module' selection method, the negative eigenvalue shows up
    eigvals, eigvecs = _randomized_eigsh(X, n_components=2, selection="module")
    assert eigvals.shape == (2,)
    assert_array_almost_equal(eigvals, [3.0, -2.0])  # negative eigenvalue here
    assert eigvecs.shape == (4, 2)

    # the 'value' selection method is expected to raise NotImplementedError
    with pytest.raises(NotImplementedError):
        _randomized_eigsh(X, n_components=2, selection="value")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("k", (10, 50, 100, 199, 200))
def test_randomized_eigsh_compared_to_others(k):
    """Check that `_randomized_eigsh` is similar to other `eigsh`

    Tests that for a random PSD matrix, `_randomized_eigsh` provides results
    comparable to LAPACK (scipy.linalg.eigh) and ARPACK
    (scipy.sparse.linalg.eigsh).

    Note: some versions of ARPACK do not support k=n_features.
    """

    # make a random PSD matrix
    n_features = 200
    X = make_sparse_spd_matrix(n_features, random_state=0)

    # compare two versions of randomized
    # rough and fast
    eigvals, eigvecs = _randomized_eigsh(
        X, n_components=k, selection="module", n_iter=25, random_state=0
    )
    # more accurate but slow (TODO find realistic settings here)
    eigvals_qr, eigvecs_qr = _randomized_eigsh(
        X,
        n_components=k,
        n_iter=25,
        n_oversamples=20,
        random_state=0,
        power_iteration_normalizer="QR",
        selection="module",
    )

    # with LAPACK
    # The `eigvals=(lo, hi)` keyword of `linalg.eigh` is deprecated and is
    # rejected by recent SciPy releases. `eigh` returns eigenvalues in
    # ascending order, so the k largest pairs are the trailing k entries of
    # the full decomposition; slicing works on every SciPy version.
    all_eigvals, all_eigvecs = linalg.eigh(X)
    eigvals_lapack = all_eigvals[n_features - k :]
    eigvecs_lapack = all_eigvecs[:, n_features - k :]
    # reorder from largest to smallest, like the randomized solver output
    indices = eigvals_lapack.argsort()[::-1]
    eigvals_lapack = eigvals_lapack[indices]
    eigvecs_lapack = eigvecs_lapack[:, indices]

    # -- eigenvalues comparison
    assert eigvals_lapack.shape == (k,)
    # comparison precision
    assert_array_almost_equal(eigvals, eigvals_lapack, decimal=6)
    assert_array_almost_equal(eigvals_qr, eigvals_lapack, decimal=6)

    # -- eigenvectors comparison
    assert eigvecs_lapack.shape == (n_features, k)
    # flip eigenvectors' sign to enforce deterministic output
    dummy_vecs = np.zeros_like(eigvecs).T
    eigvecs, _ = svd_flip(eigvecs, dummy_vecs)
    eigvecs_qr, _ = svd_flip(eigvecs_qr, dummy_vecs)
    eigvecs_lapack, _ = svd_flip(eigvecs_lapack, dummy_vecs)
    assert_array_almost_equal(eigvecs, eigvecs_lapack, decimal=4)
    assert_array_almost_equal(eigvecs_qr, eigvecs_lapack, decimal=6)

    # comparison ARPACK ~ LAPACK (some ARPACK implems do not support k=n)
    if k < n_features:
        v0 = _init_arpack_v0(n_features, random_state=0)
        # "LA" largest algebraic <=> selection="value" in randomized_eigsh
        eigvals_arpack, eigvecs_arpack = eigsh(
            X, k, which="LA", tol=0, maxiter=None, v0=v0
        )
        indices = eigvals_arpack.argsort()[::-1]
        # eigenvalues
        eigvals_arpack = eigvals_arpack[indices]
        assert_array_almost_equal(eigvals_lapack, eigvals_arpack, decimal=10)
        # eigenvectors
        eigvecs_arpack = eigvecs_arpack[:, indices]
        eigvecs_arpack, _ = svd_flip(eigvecs_arpack, dummy_vecs)
        assert_array_almost_equal(eigvecs_arpack, eigvecs_lapack, decimal=8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "n,rank",
    [
        (10, 7),
        (100, 10),
        (100, 80),
        (500, 10),
        (500, 250),
        (500, 400),
    ],
)
def test_randomized_eigsh_reconst_low_rank(n, rank):
    """Check that randomized_eigsh is able to reconstruct a low rank psd matrix

    Tests that the decomposition provided by `_randomized_eigsh` leads to
    orthonormal eigenvectors, and that a low rank PSD matrix can be effectively
    reconstructed with good accuracy using it.
    """
    assert rank < n

    # create a low rank PSD
    rng = np.random.RandomState(69)
    factor = rng.randn(n, rank)
    A = factor @ factor.T

    # approximate A with the "right" number of components
    S, V = _randomized_eigsh(A, n_components=rank, random_state=rng)

    # orthonormality checks: unit-norm columns and identity Gram matrix
    unit = np.ones(S.shape)
    assert_array_almost_equal(np.linalg.norm(V, axis=0), unit)
    assert_array_almost_equal(V.T @ V, np.diag(unit))

    # test that the reconstruction approximates A well
    rebuilt = V @ np.diag(S) @ V.T
    assert_array_almost_equal(rebuilt, A, decimal=6)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_row_norms(dtype):
    """Check `row_norms` on dense and CSR input, squared and not squared.

    The CSR case is run with both int32 and int64 index arrays.
    """
    X = np.random.RandomState(42).randn(100, 100)
    precision = 4 if dtype is np.float32 else 5

    X = X.astype(dtype, copy=False)
    sq_norm = (X**2).sum(axis=1)
    norm = np.sqrt(sq_norm)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True), precision)
    assert_array_almost_equal(norm, row_norms(X), precision)

    for csr_index_dtype in [np.int32, np.int64]:
        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        # csr_matrix will use int32 indices by default,
        # up-casting those to int64 when necessary
        if csr_index_dtype is np.int64:
            Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype, copy=False)
            Xcsr.indices = Xcsr.indices.astype(csr_index_dtype, copy=False)
        assert Xcsr.indices.dtype == csr_index_dtype
        assert Xcsr.indptr.dtype == csr_index_dtype

        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
|
||||
|
||||
|
||||
def test_randomized_svd_low_rank_with_noise():
    """Check that `randomized_svd` can handle noisy matrices.

    Without power iterations the approximated singular values are expected
    to be off; with the default iterations they should match the exact ones.
    """
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # generate a matrix X with structure of approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.1,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    # exact singular values from the slow method
    exact = linalg.svd(X, full_matrices=False)[1]

    for normalizer in ["auto", "none", "LU", "QR"]:
        # fast approximate method without the iterated power method
        _, no_iter, _ = randomized_svd(
            X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0
        )

        # the approximation does not tolerate the noise:
        assert np.abs(exact[:k] - no_iter).max() > 0.01

        # fast approximate method with iterated power method
        _, with_iter, _ = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(exact[:k], with_iter, decimal=3)
|
||||
|
||||
|
||||
def test_randomized_svd_infinite_rank():
    """Check `randomized_svd` on a matrix with slowly decaying singular values.

    The matrix is generated with `tail_strength=1.0`, i.e. without a distinct
    low rank component. Without power iterations the approximation is
    expected to be off; with five iterations it should match the exact
    singular values.
    """
    n_samples, n_features = 100, 500
    rank = 5
    k = 10

    # no 'low_rank component': just regularly but slowly decreasing singular
    # values, so the rank of the data matrix is infinite
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=1.0,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    # exact singular values from the slow method
    exact = linalg.svd(X, full_matrices=False)[1]

    for normalizer in ["auto", "none", "LU", "QR"]:
        # fast approximate method without the iterated power method
        _, no_iter, _ = randomized_svd(
            X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0
        )

        # the approximation does not tolerate the noise:
        assert np.abs(exact[:k] - no_iter).max() > 0.1

        # fast approximate method with iterated power method
        _, with_iter, _ = randomized_svd(
            X, k, n_iter=5, power_iteration_normalizer=normalizer, random_state=0
        )

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(exact[:k], with_iter, decimal=3)
|
||||
|
||||
|
||||
def test_randomized_svd_transpose_consistency():
    """Check that transposing the design matrix has limited impact."""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.5,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0)
    # Only the singular values of the "auto" run are compared below.
    _, s3, _ = randomized_svd(X, k, n_iter=3, transpose="auto", random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    # Every variant must recover the leading exact singular values.
    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    # Compare the products U @ V of the truncated factors.
    exact_uv = np.dot(U4[:, :k], V4[:k, :])
    assert_almost_equal(np.dot(U1, V1), exact_uv, decimal=2)
    assert_almost_equal(np.dot(U2, V2), exact_uv, decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
|
||||
|
||||
|
||||
def test_randomized_svd_power_iteration_normalizer():
    """Compare reconstruction errors across power iteration normalizers.

    randomized_svd with power_iteration_normalizer='none' diverges for
    large number of power iterations on this dataset, whereas the "LU",
    "QR" and "auto" normalizers keep the error stable.
    """
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    def reconstruction_error(n_iter, normalizer):
        # Frobenius norm of X minus its rank-n_components approximation.
        U, s, Vt = randomized_svd(
            X,
            n_components,
            n_iter=n_iter,
            power_iteration_normalizer=normalizer,
            random_state=0,
        )
        A = X - U.dot(np.diag(s).dot(Vt))
        return linalg.norm(A, ord="fro")

    # Check that it diverges with many (non-normalized) power iterations
    error_2 = reconstruction_error(2, "none")
    error_20 = reconstruction_error(20, "none")
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ["LU", "QR", "auto"]:
        error_2 = reconstruction_error(2, normalizer)

        for n_iter in [5, 10, 50]:
            error = reconstruction_error(n_iter, normalizer)
            assert 15 > np.abs(error_2 - error)
|
||||
|
||||
|
||||
def test_randomized_svd_sparse_warnings():
    """Check that randomized_svd throws a warning for lil and dok matrix."""
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        # Convert from the dense matrix each time instead of rebinding `X`,
        # so every format is built from the same input.
        X_sparse = cls(X)
        warn_msg = (
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__)
        )
        with pytest.warns(sparse.SparseEfficiencyWarning, match=warn_msg):
            randomized_svd(
                X_sparse, n_components, n_iter=1, power_iteration_normalizer="none"
            )
|
||||
|
||||
|
||||
def test_svd_flip():
    """Check that svd_flip works in both situations, and reconstructs input."""
    rs = np.random.RandomState(1999)
    n_samples = 20
    n_features = 10
    X = rs.randn(n_samples, n_features)

    # Check matrix reconstruction
    U, S, Vt = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, Vt, u_based_decision=False)
    assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, Vt = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, Vt, u_based_decision=True)
    assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, Vt, u_based_decision=True)
    assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
    U_flip2, V_flip2 = svd_flip(U, Vt, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
|
||||
|
||||
|
||||
def test_randomized_svd_sign_flip():
    """Check that flip_sign=True gives the same factors for every seed."""
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    # Reference factors; the singular values of this run are not needed.
    u1, _, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        # The factors must reconstruct `a` and be orthonormal.
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
|
||||
|
||||
|
||||
def test_randomized_svd_sign_flip_with_transpose():
    """Check that sign flipping is based on u irrespective of transpose.

    See https://github.com/scikit-learn/scikit-learn/issues/5608
    for more details.
    """

    def max_loading_is_positive(u, v):
        """
        returns bool tuple indicating if the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True, random_state=0)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert u_based
    assert not v_based

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True, random_state=0
    )
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose
    )
    assert u_based
    assert not v_based
|
||||
|
||||
|
||||
def test_cartesian():
    """Check if cartesian product delivers the right results."""
    axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7]))

    true_out = np.array(
        [
            [1, 4, 6],
            [1, 4, 7],
            [1, 5, 6],
            [1, 5, 7],
            [2, 4, 6],
            [2, 4, 7],
            [2, 5, 6],
            [2, 5, 7],
            [3, 4, 6],
            [3, 4, 7],
            [3, 5, 6],
            [3, 5, 7],
        ]
    )

    out = cartesian(axes)
    assert_array_equal(true_out, out)

    # check single axis
    x = np.arange(3)
    assert_array_equal(x[:, np.newaxis], cartesian((x,)))
|
||||
|
||||
|
||||
def test_logistic_sigmoid():
    """Check correctness and robustness of logistic sigmoid implementation."""

    def naive_log_logistic(x):
        # Reference value computed directly as log(expit(x)).
        return np.log(expit(x))

    x = np.linspace(-2, 2, 50)
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    # Large-magnitude inputs are compared against fixed expected values.
    extreme_x = np.array([-100.0, 100.0])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
|
||||
|
||||
|
||||
@pytest.fixture()
def rng():
    """Return a NumPy RandomState seeded with 42."""
    return np.random.RandomState(42)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
    """Check one-shot weighted mean/variance against np.average."""
    mult = 10
    X = rng.rand(1000, 20).astype(dtype) * mult
    sample_weight = rng.rand(X.shape[0]) * mult
    # Start from zero previous mean, variance and weight sum.
    mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0, sample_weight=sample_weight)

    expected_mean = np.average(X, weights=sample_weight, axis=0)
    # Weighted variance as E[X^2] - E[X]^2.
    expected_var = (
        np.average(X**2, weights=sample_weight, axis=0) - expected_mean**2
    )
    assert_almost_equal(mean, expected_mean)
    assert_almost_equal(var, expected_var)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mean", [0, 1e7, -1e7])
@pytest.mark.parametrize("var", [1, 1e-8, 1e5])
@pytest.mark.parametrize(
    "weight_loc, weight_scale", [(0, 1), (0, 1e-8), (1, 1e-8), (10, 1), (1e7, 1)]
)
def test_incremental_weighted_mean_and_variance(
    mean, var, weight_loc, weight_scale, rng
):
    """Test correctness and numerical stability of weighted updates."""

    def _assert(X, sample_weight, expected_mean, expected_var):
        # Feed X in batches of several sizes; the final statistics must not
        # depend on how the data was chunked.
        n = X.shape[0]
        for chunk_size in [1, n // 10 + 1, n // 4 + 1, n // 2 + 1, n]:
            last_mean, last_weight_sum, last_var = 0, 0, 0
            for batch in gen_batches(n, chunk_size):
                last_mean, last_var, last_weight_sum = _incremental_mean_and_var(
                    X[batch],
                    last_mean,
                    last_var,
                    last_weight_sum,
                    sample_weight=sample_weight[batch],
                )
            assert_allclose(last_mean, expected_mean)
            assert_allclose(last_var, expected_var, atol=1e-6)

    size = (100, 20)
    weight = rng.normal(loc=weight_loc, scale=weight_scale, size=size[0])

    # Compare to weighted average: np.average
    # NOTE: the `var` parameter is passed to rng.normal as `scale`.
    X = rng.normal(loc=mean, scale=var, size=size)
    expected_mean = _safe_accumulator_op(np.average, X, weights=weight, axis=0)
    expected_var = _safe_accumulator_op(
        np.average, (X - expected_mean) ** 2, weights=weight, axis=0
    )
    _assert(X, weight, expected_mean, expected_var)

    # Compare to unweighted mean: np.mean
    X = rng.normal(loc=mean, scale=var, size=size)
    ones_weight = np.ones(size[0])
    expected_mean = _safe_accumulator_op(np.mean, X, axis=0)
    expected_var = _safe_accumulator_op(np.var, X, axis=0)
    _assert(X, ones_weight, expected_mean, expected_var)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
    """Check that NaN entries are ignored by the weighted update.

    Each column of `X_nan` holds the same non-NaN values as the matching
    column of `X` plus one NaN, so both updates must agree.
    """
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_weight_sum = np.array([2, 2, 2, 2], dtype=np.int32)
    sample_weights_X = np.ones(3)
    sample_weights_X_nan = np.ones(4)

    X = np.array(
        [[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]]
    ).astype(dtype)

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    ).astype(dtype)

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_weight_sum, sample_weight=sample_weights_X
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan,
        old_means,
        old_variances,
        old_weight_sum,
        sample_weight=sample_weights_X_nan,
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
|
||||
|
||||
|
||||
def test_incremental_variance_update_formulas():
    """Test Youngs and Cramer incremental variance formulas.

    Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
    """
    A = np.array(
        [
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
        ]
    ).T
    # Split the rows in two: statistics of X1 seed the update with X2.
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
    final_means, final_variances, final_count = _incremental_mean_and_var(
        X2, old_means, old_variances, old_sample_count
    )
    # The incremental result must match statistics computed on all of A.
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
|
||||
|
||||
|
||||
def test_incremental_mean_and_variance_ignore_nan():
    """Check that NaN entries are ignored by the unweighted update.

    Each column of `X_nan` holds the same non-NaN values as the matching
    column of `X` plus one NaN, so both updates must agree.
    """
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    )

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
|
||||
|
||||
|
||||
@skip_if_32bit
def test_incremental_variance_numerical_stability():
    """Test Youngs and Cramer incremental variance formulas.

    Compares naive variance computations against _incremental_mean_and_var
    on data with a very large spread, using `A.var(axis=0)` as reference.
    """

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X**2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    # NOTE(review): defined for reference but not called in this test.
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean) ** 2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks for size 1
    def naive_mean_variance_update(x, last_mean, last_variance, last_sample_count):
        updated_sample_count = last_sample_count + 1
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = (
            last_variance * samples_ratio
            + (x - last_mean) * (x - updated_mean) / updated_sample_count
        )
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > tol while
    # _incremental_mean_and_var has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Naive one pass var: >tol (=1063)
    assert np.abs(np_var(A) - one_pass_var(A)).max() > tol

    # Starting point for online algorithms: after A0

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for row in A1:
        mean, var, n = naive_mean_variance_update(row, mean, var, n)
    assert n == A.shape[0]
    # the mean is also slightly unstable
    assert np.abs(A.mean(axis=0) - mean).max() > 1e-6
    assert np.abs(np_var(A) - var).max() > tol

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for row in A1:
        # Each sample is passed as a (1, n_features) batch.
        mean, var, n = _incremental_mean_and_var(
            row.reshape((1, A1.shape[1])), mean, var, n
        )
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert tol > np.abs(np_var(A) - var).max()
|
||||
|
||||
|
||||
def test_incremental_variance_ddof():
    """Test that degrees of freedom parameter for calculations are correct."""
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples = X.shape[0]
    for batch_size in [11, 20, 37]:
        # Batch boundaries; make sure the last one is the end of X.
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1], batch.shape[0], dtype=np.int32)
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances, sample_count
                )
                (incremental_means, incremental_variances, incremental_count) = result
                sample_count += batch.shape[0]

            # Reference statistics on all rows seen so far.
            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances, calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)
|
||||
|
||||
|
||||
def test_vector_sign_flip():
    """Test that sign flip is working & largest value has positive sign."""
    data = np.random.RandomState(36).randn(5, 5)
    max_abs_rows = np.argmax(np.abs(data), axis=1)
    data_flipped = _deterministic_vector_sign_flip(data)
    max_rows = np.argmax(data_flipped, axis=1)
    assert_array_equal(max_abs_rows, max_rows)
    # Multiplying each flipped row by the original sign of its largest
    # absolute entry must give back the input.
    signs = np.sign(data[range(data.shape[0]), max_abs_rows])
    assert_array_equal(data, data_flipped * signs[:, np.newaxis])
|
||||
|
||||
|
||||
def test_softmax():
    """Check softmax against exp(X) normalised by its row sums."""
    rng = np.random.RandomState(0)
    X = rng.randn(3, 5)
    exp_X = np.exp(X)
    sum_exp_X = np.sum(exp_X, axis=1).reshape((-1, 1))
    assert_array_almost_equal(softmax(X), exp_X / sum_exp_X)
|
||||
|
||||
|
||||
def test_stable_cumsum():
    """Check stable_cumsum against np.cumsum and its RuntimeWarning."""
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    # With zero tolerances a RuntimeWarning is expected.
    with pytest.warns(RuntimeWarning):
        stable_cumsum(r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    for axis in (0, 1, 2):
        assert_array_equal(stable_cumsum(A, axis=axis), np.cumsum(A, axis=axis))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "A_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
@pytest.mark.parametrize(
    "B_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
def test_safe_sparse_dot_2d(A_array_constr, B_array_constr):
    """Check safe_sparse_dot on 2D dense/sparse operand combinations."""
    rng = np.random.RandomState(0)

    A = rng.random_sample((30, 10))
    B = rng.random_sample((10, 20))
    # Reference product computed on the dense arrays.
    expected = np.dot(A, B)

    A = A_array_constr(A)
    B = B_array_constr(B)
    actual = safe_sparse_dot(A, B, dense_output=True)

    assert_allclose(actual, expected)
|
||||
|
||||
|
||||
def test_safe_sparse_dot_nd():
    """Check safe_sparse_dot with one N-dimensional dense operand."""
    rng = np.random.RandomState(0)

    # dense ND / sparse
    A = rng.random_sample((2, 3, 4, 5, 6))
    B = rng.random_sample((6, 7))
    expected = np.dot(A, B)
    B = sparse.csr_matrix(B)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # sparse / dense ND
    A = rng.random_sample((2, 3))
    B = rng.random_sample((4, 5, 3, 6))
    expected = np.dot(A, B)
    A = sparse.csr_matrix(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "A_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
def test_safe_sparse_dot_2d_1d(A_array_constr):
    """Check safe_sparse_dot between a 2D operand and a 1D dense vector."""
    rng = np.random.RandomState(0)

    B = rng.random_sample((10))

    # 2D @ 1D
    A = rng.random_sample((30, 10))
    expected = np.dot(A, B)
    A = A_array_constr(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # 1D @ 2D
    A = rng.random_sample((10, 30))
    expected = np.dot(B, A)
    A = A_array_constr(A)
    actual = safe_sparse_dot(B, A)
    assert_allclose(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dense_output", [True, False])
def test_safe_sparse_dot_dense_output(dense_output):
    """Check that dense_output controls whether the result is sparse."""
    rng = np.random.RandomState(0)

    A = sparse.random(30, 10, density=0.1, random_state=rng)
    B = sparse.random(10, 20, density=0.1, random_state=rng)

    expected = A.dot(B)
    actual = safe_sparse_dot(A, B, dense_output=dense_output)

    assert sparse.issparse(actual) == (not dense_output)

    if dense_output:
        # Densify the reference so both sides have the same container type.
        expected = expected.toarray()
    assert_allclose_dense_sparse(actual, expected)
|
||||
@@ -0,0 +1,31 @@
|
||||
""" Test fast_dict.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._fast_dict import IntFloatDict, argmin
|
||||
|
||||
|
||||
def test_int_float_dict():
    """Check construction, lookup, length and append on IntFloatDict."""
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.0)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    # Append many more entries, then read one of them back.
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0
|
||||
|
||||
|
||||
def test_int_float_dict_argmin():
    """Test the argmin implementation on the IntFloatDict."""
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)
|
||||
@@ -0,0 +1,48 @@
|
||||
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Justin Vincent
|
||||
# Lars Buitinck
|
||||
# License: BSD 3 clause
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.stats
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
from sklearn.utils.fixes import _object_dtype_isnan
|
||||
from sklearn.utils.fixes import loguniform
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
def test_object_dtype_isnan(dtype, val):
    """Check that _object_dtype_isnan flags exactly the NaN entries."""
    X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True], [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("low,high,base", [(-1, 0, 10), (0, 2, np.exp(1)), (-1, 1, 2)])
def test_loguniform(low, high, base):
    """Check bounds, size, log-uniformity and seeding of loguniform."""
    rv = loguniform(base**low, base**high)
    assert isinstance(rv, scipy.stats._distn_infrastructure.rv_frozen)
    rvs = rv.rvs(size=2000, random_state=0)

    # Test the basics; right bounds, right size
    assert (base**low <= rvs).all() and (rvs <= base**high).all()
    assert len(rvs) == 2000

    # Test that it's actually (fairly) uniform
    log_rvs = np.array([math.log(x, base) for x in rvs])
    counts, _ = np.histogram(log_rvs)
    assert counts.mean() == 200
    assert np.abs(counts - counts.mean()).max() <= 40

    # Test that random_state works
    assert loguniform(base**low, base**high).rvs(random_state=0) == loguniform(
        base**low, base**high
    ).rvs(random_state=0)
|
||||
@@ -0,0 +1,80 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from sklearn.neighbors import kneighbors_graph
|
||||
from sklearn.utils.graph import _fix_connected_components
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
|
||||
|
||||
def test_fix_connected_components():
    """Test that _fix_connected_components reduces the number of component to 1."""
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components > 1

    graph = _fix_connected_components(X, graph, n_connected_components, labels)

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components == 1
|
||||
|
||||
|
||||
def test_fix_connected_components_precomputed():
    """Test that _fix_connected_components accepts precomputed distance matrix."""
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components > 1

    distances = pairwise_distances(X)
    graph = _fix_connected_components(
        distances, graph, n_connected_components, labels, metric="precomputed"
    )

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components == 1

    # but it does not work with precomputed neighbors graph
    with pytest.raises(RuntimeError, match="does not work with a sparse"):
        _fix_connected_components(
            graph, graph, n_connected_components, labels, metric="precomputed"
        )
|
||||
|
||||
|
||||
def test_fix_connected_components_wrong_mode():
    """Test that an error is raised if the mode string is incorrect."""
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
    n_connected_components, labels = connected_components(graph)

    # The call is expected to raise, so its result is not bound.
    with pytest.raises(ValueError, match="Unknown mode"):
        _fix_connected_components(
            X, graph, n_connected_components, labels, mode="foo"
        )
|
||||
|
||||
|
||||
def test_fix_connected_components_connectivity_mode():
    """Test that the connectivity mode fill new connections with ones."""
    X = np.array([0, 1, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
    n_connected_components, labels = connected_components(graph)
    graph = _fix_connected_components(
        X, graph, n_connected_components, labels, mode="connectivity"
    )
    assert np.all(graph.data == 1)
|
||||
|
||||
|
||||
def test_fix_connected_components_distance_mode():
    """Test that the distance mode does not fill new connections with ones."""
    X = np.array([0, 1, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
    # Before fixing, every stored distance equals 1.
    assert np.all(graph.data == 1)

    n_connected_components, labels = connected_components(graph)
    graph = _fix_connected_components(
        X, graph, n_connected_components, labels, mode="distance"
    )
    assert not np.all(graph.data == 1)
|
||||
@@ -0,0 +1,172 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import warnings
|
||||
|
||||
import pickle
|
||||
|
||||
from sklearn.utils.metaestimators import if_delegate_has_method
|
||||
from sklearn.utils.metaestimators import available_if
|
||||
|
||||
|
||||
class Prefix:
    """Minimal object exposing a no-op ``func`` method."""

    def func(self):
        pass
|
||||
|
||||
|
||||
class MockMetaEstimator:
    """This is a mock meta estimator"""

    # Delegate object looked up by the decorator below.
    a_prefix = Prefix()

    @if_delegate_has_method(delegate="a_prefix")
    def func(self):
        """This is a mock delegated function"""
        pass
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:if_delegate_has_method was deprecated")
def test_delegated_docstring():
    """Check the docstring of a delegated method is kept on all access paths."""
    assert "This is a mock delegated function" in str(
        MockMetaEstimator.__dict__["func"].__doc__
    )
    assert "This is a mock delegated function" in str(MockMetaEstimator.func.__doc__)
    assert "This is a mock delegated function" in str(MockMetaEstimator().func.__doc__)
|
||||
|
||||
|
||||
class MetaEst:
    """A mock meta estimator"""

    def __init__(self, sub_est, better_sub_est=None):
        self.sub_est = sub_est
        self.better_sub_est = better_sub_est

    @if_delegate_has_method(delegate="sub_est")
    def predict(self):
        pass
|
||||
|
||||
|
||||
class MetaEstTestTuple(MetaEst):
    """A mock meta estimator to test passing a tuple of delegates"""

    @if_delegate_has_method(delegate=("sub_est", "better_sub_est"))
    def predict(self):
        pass
|
||||
|
||||
|
||||
class MetaEstTestList(MetaEst):
    """A mock meta estimator to test passing a list of delegates"""

    @if_delegate_has_method(delegate=["sub_est", "better_sub_est"])
    def predict(self):
        pass
|
||||
|
||||
|
||||
class HasPredict:
    """A mock sub-estimator with predict method"""

    def predict(self):
        pass
|
||||
|
||||
|
||||
class HasNoPredict:
    """A mock sub-estimator with no predict method"""

    pass
|
||||
|
||||
|
||||
class HasPredictAsNDArray:
    """A mock sub-estimator where predict is a NumPy array"""

    # `predict` is deliberately an array attribute, not a method.
    predict = np.ones((10, 2), dtype=np.int64)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:if_delegate_has_method was deprecated")
def test_if_delegate_has_method():
    """Check when `predict` is exposed for single, tuple and list delegates."""
    assert hasattr(MetaEst(HasPredict()), "predict")
    assert not hasattr(MetaEst(HasNoPredict()), "predict")
    assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasNoPredict()), "predict")
    assert hasattr(MetaEstTestTuple(HasPredict(), HasNoPredict()), "predict")
    assert not hasattr(MetaEstTestTuple(HasNoPredict(), HasPredict()), "predict")
    assert not hasattr(MetaEstTestList(HasNoPredict(), HasPredict()), "predict")
    assert hasattr(MetaEstTestList(HasPredict(), HasPredict()), "predict")
|
||||
|
||||
|
||||
class AvailableParameterEstimator:
    """This estimator's `available` parameter toggles the presence of a method"""

    def __init__(self, available=True, return_value=1):
        self.available = available
        self.return_value = return_value

    @available_if(lambda est: est.available)
    def available_func(self):
        """This is a mock available_if function"""
        return self.return_value
|
||||
|
||||
|
||||
def test_available_if_docstring():
    """Check the docstring of an available_if method is kept on all access paths."""
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator.__dict__["available_func"].__doc__
    )
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator.available_func.__doc__
    )
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator().available_func.__doc__
    )
|
||||
|
||||
|
||||
def test_available_if():
    """Check that `available_func` exists only when `available` is true."""
    assert hasattr(AvailableParameterEstimator(), "available_func")
    assert not hasattr(AvailableParameterEstimator(available=False), "available_func")
|
||||
|
||||
|
||||
def test_available_if_unbound_method():
    """Check that decorated functions can be used as an unbound method.

    This is a non regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20614
    to make sure that decorated functions can be used as an unbound method,
    for instance when monkeypatching.
    """
    est = AvailableParameterEstimator()
    AvailableParameterEstimator.available_func(est)

    est = AvailableParameterEstimator(available=False)
    with pytest.raises(
        AttributeError,
        match="This 'AvailableParameterEstimator' has no attribute 'available_func'",
    ):
        AvailableParameterEstimator.available_func(est)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:if_delegate_has_method was deprecated")
def test_if_delegate_has_method_numpy_array():
    """The delegate check works when the attribute is a NumPy array.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/21144
    """
    meta_estimator = MetaEst(HasPredictAsNDArray())
    assert hasattr(meta_estimator, "predict")
|
||||
|
||||
|
||||
def test_if_delegate_has_method_deprecated():
    """Check when the `if_delegate_has_method` deprecation warning fires."""
    # Creating the decorator must stay silent: any FutureWarning raised here
    # is escalated to an error.
    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        if_delegate_has_method(delegate="predict")

    # The warning is expected only once the decorated attribute is used.
    deprecation_msg = "if_delegate_has_method was deprecated"
    with pytest.warns(FutureWarning, match=deprecation_msg):
        hasattr(MetaEst(HasPredict()), "predict")
|
||||
|
||||
|
||||
def test_available_if_methods_can_be_pickled():
    """An `available_if` method survives a pickle round trip.

    Non-regression test for #21344.
    """
    expected = 10
    estimator = AvailableParameterEstimator(available=True, return_value=expected)
    restored_func = pickle.loads(pickle.dumps(estimator.available_func))
    assert restored_func() == expected
|
||||
@@ -0,0 +1,183 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
|
||||
from numpy.testing import assert_array_equal
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.utils import check_array
|
||||
from sklearn.utils import _safe_indexing
|
||||
from sklearn.utils._testing import _convert_container
|
||||
|
||||
from sklearn.utils._mocking import CheckingClassifier
|
||||
|
||||
|
||||
@pytest.fixture
def iris():
    """Fixture returning the iris dataset as an ``(X, y)`` pair."""
    X_and_y = load_iris(return_X_y=True)
    return X_and_y
|
||||
|
||||
|
||||
def _success(x):
|
||||
return True
|
||||
|
||||
|
||||
def _fail(x):
|
||||
return False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs",
    [
        dict(),
        dict(check_X=_success),
        dict(check_y=_success),
        dict(check_X=_success, check_y=_success),
    ],
)
def test_check_on_fit_success(iris, kwargs):
    """`fit` runs without error when every configured check passes."""
    X, y = iris
    classifier = CheckingClassifier(**kwargs)
    classifier.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs",
    [
        dict(check_X=_fail),
        dict(check_y=_fail),
        dict(check_X=_success, check_y=_fail),
        dict(check_X=_fail, check_y=_success),
        dict(check_X=_fail, check_y=_fail),
    ],
)
def test_check_on_fit_fail(iris, kwargs):
    """`fit` raises AssertionError as soon as one configured check fails."""
    X, y = iris
    classifier = CheckingClassifier(**kwargs)
    with pytest.raises(AssertionError):
        classifier.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_success(iris, pred_func):
    """Each prediction method runs when `check_X` passes."""
    X, y = iris
    fitted = CheckingClassifier(check_X=_success).fit(X, y)
    method = getattr(fitted, pred_func)
    method(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_fail(iris, pred_func):
    """Each prediction method raises once `check_X` is swapped for a failing check."""
    X, y = iris
    # Fit with a passing check, then switch to the failing one for prediction.
    classifier = CheckingClassifier(check_X=_success).fit(X, y)
    classifier.set_params(check_X=_fail)
    with pytest.raises(AssertionError):
        getattr(classifier, pred_func)(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_type", ["list", "array", "sparse", "dataframe"])
def test_checking_classifier(iris, input_type):
    """CheckingClassifier produces the expected outputs for each container type."""
    X, y = iris
    X = _convert_container(X, input_type)
    clf = CheckingClassifier()
    clf.fit(X, y)

    # Fitted attributes
    assert_array_equal(clf.classes_, np.unique(y))
    assert len(clf.classes_) == 3
    assert clf.n_features_in_ == 4

    # predict returns all zeros
    predictions = clf.predict(X)
    assert_array_equal(predictions, np.zeros(predictions.size, dtype=int))

    # score is 0 by default and 1 once foo_param is set to 10
    assert clf.score(X) == pytest.approx(0)
    clf.set_params(foo_param=10)
    assert clf.fit(X, y).score(X) == pytest.approx(1)

    proba = clf.predict_proba(X)
    assert proba.shape == (150, 3)
    assert_allclose(proba[:, 0], 1)
    assert_allclose(proba[:, 1:], 0)

    decision = clf.decision_function(X)
    assert decision.shape == (150, 3)
    assert_allclose(decision[:, 0], 1)
    assert_allclose(decision[:, 1:], 0)

    # Output shapes for binary classification: keep only classes 0 and 1
    binary_mask = np.logical_or(y == 0, y == 1)
    X = _safe_indexing(X, binary_mask)
    y = _safe_indexing(y, binary_mask)
    clf.fit(X, y)

    proba = clf.predict_proba(X)
    assert proba.shape == (100, 2)
    assert_allclose(proba[:, 0], 1)
    assert_allclose(proba[:, 1], 0)

    decision = clf.decision_function(X)
    assert decision.shape == (100,)
    assert_allclose(decision, 0)
|
||||
|
||||
|
||||
def test_checking_classifier_with_params(iris):
    """`check_X` may be any callable, optionally configured via `check_X_params`."""
    X, y = iris
    X_sparse = sparse.csr_matrix(X)

    # A check that only accepts sparse input rejects the dense array.
    sparse_only = CheckingClassifier(check_X=sparse.issparse)
    with pytest.raises(AssertionError):
        sparse_only.fit(X, y)
    sparse_only.fit(X_sparse, y)

    # check_array with accept_sparse=False accepts dense and rejects sparse.
    dense_only = CheckingClassifier(
        check_X=check_array, check_X_params={"accept_sparse": False}
    )
    dense_only.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        dense_only.fit(X_sparse, y)
|
||||
|
||||
|
||||
def test_checking_classifier_fit_params(iris):
    """A `sample_weight` with the wrong number of samples raises ValueError."""
    X, y = iris
    classifier = CheckingClassifier(expected_sample_weight=True)
    # Deliberately half as many weights as samples.
    half_weights = np.ones(len(X) // 2)

    expected_msg = f"sample_weight.shape == ({len(X) // 2},), expected ({len(X)},)!"
    with pytest.raises(ValueError) as exc_info:
        classifier.fit(X, y, sample_weight=half_weights)
    assert exc_info.value.args[0] == expected_msg
|
||||
|
||||
|
||||
def test_checking_classifier_missing_fit_params(iris):
    """Omitting `sample_weight` when it is expected raises AssertionError."""
    X, y = iris
    classifier = CheckingClassifier(expected_sample_weight=True)
    with pytest.raises(AssertionError, match="Expected sample_weight to be passed"):
        classifier.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "methods_to_check",
    [["predict"], ["predict", "predict_proba"]],
)
@pytest.mark.parametrize(
    "predict_method", ["predict", "predict_proba", "decision_function", "score"]
)
def test_checking_classifier_methods_to_check(iris, methods_to_check, predict_method):
    """Only methods listed in `methods_to_check` run the input check."""
    X, y = iris

    classifier = CheckingClassifier(
        check_X=sparse.issparse,
        methods_to_check=methods_to_check,
    )
    # fit is not listed, so dense X is accepted here.
    classifier.fit(X, y)

    if predict_method not in methods_to_check:
        # Check bypassed: the call must succeed on dense input.
        getattr(classifier, predict_method)(X)
        return

    with pytest.raises(AssertionError):
        getattr(classifier, predict_method)(X)
|
||||
@@ -0,0 +1,429 @@
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
from itertools import product
|
||||
import pytest
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils.estimator_checks import _NotAnArray
|
||||
|
||||
from sklearn.utils.multiclass import unique_labels
|
||||
from sklearn.utils.multiclass import is_multilabel
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
from sklearn.utils.multiclass import class_distribution
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.utils.multiclass import _ovr_decision_function
|
||||
|
||||
from sklearn.utils.metaestimators import _safe_split
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
from sklearn.svm import SVC
|
||||
from sklearn import datasets
|
||||
|
||||
|
||||
# Example targets grouped by the target-type label that the tests in this
# module expect for them (each key is compared against `type_of_target`).
EXAMPLES = {
    "multilabel-indicator": [
        # valid when the data is formatted as sparse or dense, identified
        # by CSR format when the testing takes place
        csr_matrix(np.random.RandomState(42).randint(2, size=(10, 10))),
        [[0, 1], [1, 0]],
        [[0, 1]],
        csr_matrix(np.array([[0, 1], [1, 0]])),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=bool)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.int8)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.uint8)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=float)),
        csr_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32)),
        csr_matrix(np.array([[0, 0], [0, 0]])),
        csr_matrix(np.array([[0, 1]])),
        # Only valid when data is dense
        [[-1, 1], [1, -1]],
        np.array([[-1, 1], [1, -1]]),
        np.array([[-3, 3], [3, -3]]),
        _NotAnArray(np.array([[-3, 3], [3, -3]])),
    ],
    "multiclass": [
        [1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
        np.array([1, 0, 2]),
        np.array([1, 0, 2], dtype=np.int8),
        np.array([1, 0, 2], dtype=np.uint8),
        np.array([1, 0, 2], dtype=float),
        np.array([1, 0, 2], dtype=np.float32),
        np.array([[1], [0], [2]]),
        _NotAnArray(np.array([1, 0, 2])),
        [0, 1, 2],
        ["a", "b", "c"],
        np.array(["a", "b", "c"]),
        np.array(["a", "b", "c"], dtype=object),
        # NOTE(review): identical to the previous entry
        np.array(["a", "b", "c"], dtype=object),
    ],
    "multiclass-multioutput": [
        [[1, 0, 2, 2], [1, 4, 2, 4]],
        [["a", "b"], ["c", "d"]],
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
        np.array([["a", "b"], ["c", "d"]]),
        # NOTE(review): identical to the previous entry
        np.array([["a", "b"], ["c", "d"]]),
        np.array([["a", "b"], ["c", "d"]], dtype=object),
        np.array([[1, 0, 2]]),
        _NotAnArray(np.array([[1, 0, 2]])),
    ],
    "binary": [
        [0, 1],
        [1, 1],
        [],
        [0],
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
        np.array([[0], [1]]),
        _NotAnArray(np.array([[0], [1]])),
        [1, -1],
        [3, 5],
        ["a"],
        ["a", "b"],
        ["abc", "def"],
        np.array(["abc", "def"]),
        # NOTE(review): same value as the ["a", "b"] entry above
        ["a", "b"],
        np.array(["abc", "def"], dtype=object),
    ],
    "continuous": [
        [1e-5],
        [0, 0.5],
        np.array([[0], [0.5]]),
        np.array([[0], [0.5]], dtype=np.float32),
    ],
    "continuous-multioutput": [
        np.array([[0, 0.5], [0.5, 0]]),
        np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
        np.array([[0, 0.5]]),
    ],
    "unknown": [
        [[]],
        [()],
        # sequence of sequences that weren't supported even before deprecation
        np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
        [np.array([]), np.array([1, 2, 3])],
        [{1, 2, 3}, {1, 2}],
        [frozenset([1, 2, 3]), frozenset([1, 2])],
        # and also confusable as sequences of sequences
        [{0: "a", 1: "b"}, {0: "a"}],
        # empty second dimension
        np.array([[], []]),
        # 3d
        np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
    ],
}
|
||||
|
||||
# Inputs that the tests below expect to be rejected with a ValueError
# (sets, mappings, a plain string and None).
NON_ARRAY_LIKE_EXAMPLES = [
    {1, 2, 3},
    {0: "a", 1: "b"},
    {0: [5], 1: [5]},
    "abc",
    frozenset({1, 2, 3}),
    None,
]
|
||||
|
||||
# Legacy "sequence of sequences" multilabel representations; the tests below
# expect `type_of_target` to reject them with a ValueError.
MULTILABEL_SEQUENCES = [
    [[1], [2], [0, 1]],
    # `(2,)` rather than `(2)`: the latter is just the int 2, which would make
    # this a mixed sequence instead of a sequence of tuples.
    [(), (2,), (0, 1)],
    np.array([[], [1, 2]], dtype="object"),
    _NotAnArray(np.array([[], [1, 2]], dtype="object")),
]
|
||||
|
||||
|
||||
def test_unique_labels():
    """Exercise unique_labels on multiclass, multilabel-indicator and mixed inputs."""
    # Calling with no argument at all is an error.
    with pytest.raises(ValueError):
        unique_labels()

    # Multiclass targets
    first_ten = np.arange(10)
    assert_array_equal(unique_labels(range(10)), first_ten)
    assert_array_equal(unique_labels(np.arange(10)), first_ten)
    assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))

    # Multilabel indicator matrices with three columns
    three_labels = np.arange(3)
    full_indicator = np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])
    assert_array_equal(unique_labels(full_indicator), three_labels)

    partial_indicator = np.array([[0, 0, 1], [0, 0, 0]])
    assert_array_equal(unique_labels(partial_indicator), three_labels)

    # Several targets passed at once
    assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
    assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), three_labels)

    # Border-line cases involving binary indicator matrices
    with pytest.raises(ValueError):
        unique_labels([4, 0, 2], np.ones((5, 5)))
    with pytest.raises(ValueError):
        unique_labels(np.ones((5, 4)), np.ones((5, 5)))

    assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))
|
||||
|
||||
|
||||
def test_unique_labels_non_specific():
    """Run unique_labels over the collected examples of every target type."""
    # Smoke test: supported target types must not raise.
    supported_types = ("binary", "multiclass", "multilabel-indicator")
    for y_type in supported_types:
        for y in EXAMPLES[y_type]:
            unique_labels(y)

    # Non array-like inputs are rejected.
    for example in NON_ARRAY_LIKE_EXAMPLES:
        with pytest.raises(ValueError):
            unique_labels(example)

    # Unsupported target types are rejected as well.
    unsupported_types = (
        "unknown",
        "continuous",
        "continuous-multioutput",
        "multiclass-multioutput",
    )
    for y_type in unsupported_types:
        for example in EXAMPLES[y_type]:
            with pytest.raises(ValueError):
                unique_labels(example)
|
||||
|
||||
|
||||
def test_unique_labels_mixed_types():
    """Mixing target formats or label types raises ValueError."""
    # Multilabel indicators combined with binary or multiclass targets,
    # in either argument order.
    single_output_examples = EXAMPLES["multiclass"] + EXAMPLES["binary"]
    for y_multilabel, y_multiclass in product(
        EXAMPLES["multilabel-indicator"], single_output_examples
    ):
        with pytest.raises(ValueError):
            unique_labels(y_multiclass, y_multilabel)
        with pytest.raises(ValueError):
            unique_labels(y_multilabel, y_multiclass)

    # Mixes of string and number labels
    mixed_label_calls = (
        ([[1, 2]], [["a", "d"]]),
        (["1", 2],),
        ([["1", 2], [1, 3]],),
        ([["1", "2"], [2, 3]],),
    )
    for call_args in mixed_label_calls:
        with pytest.raises(ValueError):
            unique_labels(*call_args)
|
||||
|
||||
|
||||
def test_is_multilabel():
    """is_multilabel is True only for multilabel-indicator examples, dense or sparse."""
    sparse_formats = (coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix)

    for group, group_examples in EXAMPLES.items():
        dense_exp = group == "multilabel-indicator"

        for example in group_examples:
            # Only explicitly defined sparse examples count as valid sparse
            # multilabel-indicators.
            sparse_exp = bool(dense_exp and issparse(example))

            # An example is converted to the sparse formats when it is already
            # sparse, or is a numeric 2d array-like with at least one column.
            convertible = issparse(example)
            if not convertible and hasattr(example, "__array__"):
                as_array = np.asarray(example)
                convertible = (
                    as_array.ndim == 2
                    and as_array.dtype.kind in "biuf"
                    and as_array.shape[1] > 0
                )

            if convertible:
                for to_sparse in sparse_formats:
                    exmpl_sparse = to_sparse(example)
                    assert sparse_exp == is_multilabel(
                        exmpl_sparse
                    ), "is_multilabel(%r) should be %s" % (exmpl_sparse, sparse_exp)

            # Densify sparse examples before the dense check
            if issparse(example):
                example = example.toarray()

            assert dense_exp == is_multilabel(
                example
            ), "is_multilabel(%r) should be %s" % (example, dense_exp)
|
||||
|
||||
|
||||
def test_check_classification_targets():
    """Non-classification target types are rejected; all others are accepted."""
    rejected_types = ("unknown", "continuous", "continuous-multioutput")
    for y_type, examples in EXAMPLES.items():
        for example in examples:
            if y_type not in rejected_types:
                check_classification_targets(example)
                continue
            with pytest.raises(ValueError, match="Unknown label type: "):
                check_classification_targets(example)
|
||||
|
||||
|
||||
# @ignore_warnings
|
||||
def test_type_of_target():
    """type_of_target labels every example and rejects unsupported inputs."""
    for group, group_examples in EXAMPLES.items():
        for example in group_examples:
            detected = type_of_target(example)
            assert detected == group, "type_of_target(%r) should be %r, got %r" % (
                example,
                group,
                detected,
            )

    # Non array-like inputs
    array_like_regex = r"Expected array-like \(array or non-string sequence\).*"
    for example in NON_ARRAY_LIKE_EXAMPLES:
        with pytest.raises(ValueError, match=array_like_regex):
            type_of_target(example)

    # Legacy sequence-of-sequences multilabel inputs
    legacy_msg = (
        "You appear to be using a legacy multi-label data "
        "representation. Sequence of sequences are no longer supported;"
        " use a binary array or sparse matrix instead."
    )
    for example in MULTILABEL_SEQUENCES:
        with pytest.raises(ValueError, match=legacy_msg):
            type_of_target(example)
|
||||
|
||||
|
||||
def test_type_of_target_pandas_sparse():
    """A pandas SparseArray target is rejected (skipped without pandas)."""
    pd = pytest.importorskip("pandas")

    sparse_target = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
    expected_msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
    with pytest.raises(ValueError, match=expected_msg):
        type_of_target(sparse_target)
|
||||
|
||||
|
||||
def test_class_distribution():
    """Check class_distribution on dense and sparse targets, with and without
    sample weights.

    The weighted case is run on the sparse matrix too; it previously passed
    the dense array twice, leaving the weighted sparse input untested.
    """
    y = np.array(
        [
            [1, 0, 0, 1],
            [2, 2, 0, 1],
            [1, 3, 0, 1],
            [4, 2, 0, 1],
            [2, 0, 0, 1],
            [1, 3, 0, 1],
        ]
    )
    # Define the sparse matrix with a mix of implicit and explicit zeros
    data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
    indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
    indptr = np.array([0, 6, 11, 11, 17])
    y_sp = sp.csc_matrix((data, indices, indptr), shape=(6, 4))

    classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
    n_classes_expected = [3, 3, 1, 1]

    def check_distribution(result, class_prior_expected):
        # Compare one (classes, n_classes, class_prior) triple column by column.
        classes, n_classes, class_prior = result
        for k in range(y.shape[1]):
            assert_array_almost_equal(classes[k], classes_expected[k])
            assert_array_almost_equal(n_classes[k], n_classes_expected[k])
            assert_array_almost_equal(class_prior[k], class_prior_expected[k])

    # Without sample weights
    class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]
    check_distribution(class_distribution(y), class_prior_expected)
    check_distribution(class_distribution(y_sp), class_prior_expected)

    # Test again with explicit sample weights
    sample_weight = [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]
    check_distribution(class_distribution(y, sample_weight), class_prior_expected)
    check_distribution(class_distribution(y_sp, sample_weight), class_prior_expected)
|
||||
|
||||
|
||||
def test_safe_split_with_precomputed_kernel():
    """_safe_split on a precomputed kernel matches the kernel of the split data."""
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    # Linear kernel matrix of the full dataset
    K = X @ X.T

    splitter = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = next(iter(splitter.split(X)))

    # Training part
    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, X_train @ X_train.T)
    assert_array_almost_equal(y_train, y_train2)

    # Test part (kernel taken against the training samples)
    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, X_test @ X_train.T)
    assert_array_almost_equal(y_test, y_test2)
|
||||
|
||||
|
||||
def test_ovr_decision_function():
    """Check properties of the values returned by _ovr_decision_function."""
    predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
    confidences = np.array(
        [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
    )
    n_classes = 3

    dec_values = _ovr_decision_function(predictions, confidences, n_classes)

    # Decision values stay within 0.5 of the votes.
    votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])
    assert_allclose(votes, dec_values, atol=0.5)

    # Predictions follow the highest vote, or the highest confidence on a tie;
    # the second sample is a tie that class 1 should win.
    expected_prediction = np.array([2, 1, 2, 2])
    assert_array_equal(dec_values.argmax(axis=1), expected_prediction)

    # The third and fourth samples have the same votes but the third has the
    # higher confidence, which must show in the decision values.
    assert dec_values[2, 2] > dec_values[3, 2]

    # Subset invariance: evaluating samples one at a time gives the same values.
    dec_values_one = [
        _ovr_decision_function(np.array([pred_row]), np.array([conf_row]), n_classes)[
            0
        ]
        for pred_row, conf_row in zip(predictions, confidences)
    ]
    assert_allclose(dec_values, dec_values_one, atol=1e-6)
|
||||
@@ -0,0 +1,74 @@
|
||||
# Author: Olivier Grisel <olivier.grisel@ensta.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
from sklearn.utils.murmurhash import murmurhash3_32
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
|
||||
def test_mmhash3_int():
    """Reference hashes of the integer 3 for each seed / positive combination."""
    cases = [
        ({}, 847579505),
        ({"seed": 0}, 847579505),
        ({"seed": 42}, -1823081949),
        ({"positive": False}, 847579505),
        ({"seed": 0, "positive": False}, 847579505),
        ({"seed": 42, "positive": False}, -1823081949),
        ({"positive": True}, 847579505),
        ({"seed": 0, "positive": True}, 847579505),
        ({"seed": 42, "positive": True}, 2471885347),
    ]
    for kwargs, expected in cases:
        assert murmurhash3_32(3, **kwargs) == expected
|
||||
|
||||
|
||||
def test_mmhash3_int_array():
    """Hashing an int32 array matches hashing each element on its own."""
    rng = np.random.RandomState(42)
    flat_keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = flat_keys.reshape((3, 2, 1))
    seeds = (0, 42)

    # Signed hashes
    for seed in seeds:
        per_key = [murmurhash3_32(int(k), seed) for k in keys.flat]
        expected = np.array(per_key).reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    # Hashes requested with positive=True
    for seed in seeds:
        per_key = [murmurhash3_32(k, seed, positive=True) for k in keys.flat]
        expected = np.array(per_key).reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)
|
||||
|
||||
|
||||
def test_mmhash3_bytes():
    """Reference hashes of the bytes key b"foo"."""
    key = b"foo"
    signed_expected = {0: -156908512, 42: -1322301282}
    for seed, expected in signed_expected.items():
        assert murmurhash3_32(key, seed) == expected

    positive_expected = {0: 4138058784, 42: 2972666014}
    for seed, expected in positive_expected.items():
        assert murmurhash3_32(key, seed, positive=True) == expected
|
||||
|
||||
|
||||
def test_mmhash3_unicode():
    """Reference hashes of the str key "foo"."""
    key = "foo"
    signed_expected = {0: -156908512, 42: -1322301282}
    for seed, expected in signed_expected.items():
        assert murmurhash3_32(key, seed) == expected

    positive_expected = {0: 4138058784, 42: 2972666014}
    for seed, expected in positive_expected.items():
        assert murmurhash3_32(key, seed, positive=True) == expected
|
||||
|
||||
|
||||
def test_no_collision_on_byte_range():
    """Strings of 0 to 99 spaces must all hash to distinct values."""
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(" " * i, 0)
        assert h not in previous_hashes, "Found collision on growing empty string"
        # Record the hash; without this the membership check above could
        # never fail.
        previous_hashes.add(h)
|
||||
|
||||
|
||||
def test_uniform_distribution():
    """Positive hashes of 0..n_samples-1 spread evenly over the bins."""
    n_bins, n_samples = 10, 100000

    # Count how many hashes fall into each bin (hash modulo n_bins).
    bin_ids = [murmurhash3_32(i, positive=True) % n_bins for i in range(n_samples)]
    counts = np.bincount(bin_ids, minlength=n_bins)

    means = counts / n_samples
    expected = np.full(n_bins, 1.0 / n_bins)

    # Observed and expected frequencies agree to 2 decimals.
    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
|
||||
@@ -0,0 +1,32 @@
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils.optimize import _newton_cg
|
||||
from scipy.optimize import fmin_ncg
|
||||
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
|
||||
def test_newton_cg():
    """Check that _newton_cg gives the same result as scipy's fmin_ncg.

    The objective is the quadratic ``0.5 * ||A x||^2``, whose gradient is
    ``A^T A x`` and whose Hessian is the constant matrix ``A^T A``.
    """
    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x0 = np.ones(10)

    def func(x):
        Ax = A.dot(x)
        return 0.5 * (Ax).dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def hess(x, p):
        # Hessian-vector product A^T A p. The previous `A.dot(x.all())` form
        # was only right while every entry of x was non-zero.
        return A.T.dot(A.dot(p))

    def grad_hess(x):
        # Gradient at x plus a callable computing the Hessian-vector product.
        return grad(x), lambda x: A.T.dot(A.dot(x))

    assert_array_almost_equal(
        _newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0],
        fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess),
    )
|
||||
@@ -0,0 +1,24 @@
|
||||
import pytest
|
||||
from joblib import Parallel
|
||||
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn._config import config_context, get_config
|
||||
from sklearn.utils.fixes import delayed
|
||||
|
||||
|
||||
def get_working_memory():
    """Return the "working_memory" entry of the current configuration."""
    current_config = get_config()
    return current_config["working_memory"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
def test_configuration_passes_through_to_joblib(n_jobs, backend):
    """The global configuration is passed on to joblib jobs."""
    n_tasks = 2
    with config_context(working_memory=123):
        runner = Parallel(n_jobs=n_jobs, backend=backend)
        results = runner(delayed(get_working_memory)() for _ in range(n_tasks))

    assert_array_equal(results, [123] * n_tasks)
|
||||
@@ -0,0 +1,680 @@
|
||||
import re
|
||||
from pprint import PrettyPrinter
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._pprint import _EstimatorPrettyPrinter
|
||||
from sklearn.linear_model import LogisticRegressionCV
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
from sklearn import config_context
|
||||
|
||||
|
||||
# Ignore flake8 (lots of line too long issues)
|
||||
# flake8: noqa
|
||||
|
||||
# Constructors excerpted to test pprinting
|
||||
class LogisticRegression(BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(
        self,
        penalty="l2",
        dual=False,
        tol=1e-4,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver="warn",
        max_iter=100,
        multi_class="warn",
        verbose=0,
        warm_start=False,
        n_jobs=None,
        l1_ratio=None,
    ):
        # Regularisation and convergence settings
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        # Intercept and class handling
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        # Solver configuration
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.l1_ratio = l1_ratio

    def fit(self, X, y):
        """No-op fit that returns the estimator itself."""
        return self
|
||||
|
||||
|
||||
class StandardScaler(TransformerMixin, BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def transform(self, X, copy=None):
        """No-op transform that returns the estimator itself."""
        return self
|
||||
|
||||
|
||||
class RFE(BaseEstimator):
    """Constructor-only stand-in (wraps another estimator) used to test
    pretty-printing."""

    def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
        self.estimator, self.n_features_to_select = estimator, n_features_to_select
        self.step, self.verbose = step, verbose
|
||||
|
||||
|
||||
class GridSearchCV(BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(
        self,
        estimator,
        param_grid,
        scoring=None,
        n_jobs=None,
        iid="warn",
        refit=True,
        cv="warn",
        verbose=0,
        pre_dispatch="2*n_jobs",
        error_score="raise-deprecating",
        return_train_score=False,
    ):
        # Searched estimator and its parameter grid
        self.estimator = estimator
        self.param_grid = param_grid
        # Remaining options, stored unchanged
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score
|
||||
|
||||
|
||||
class CountVectorizer(BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(
        self,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
    ):
        # Attributes are stored in the same order as the signature.
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.lowercase = lowercase
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.token_pattern = token_pattern
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype
|
||||
|
||||
|
||||
class Pipeline(BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(self, steps, memory=None):
        self.steps, self.memory = steps, memory
|
||||
|
||||
|
||||
class SVC(BaseEstimator):
    """Constructor-only stand-in used to test pretty-printing."""

    def __init__(
        self,
        C=1.0,
        kernel="rbf",
        degree=3,
        gamma="auto_deprecated",
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=1e-3,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape="ovr",
        random_state=None,
    ):
        # Attributes are stored in the same order as the signature.
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.shrinking = shrinking
        self.probability = probability
        self.tol = tol
        self.cache_size = cache_size
        self.class_weight = class_weight
        self.verbose = verbose
        self.max_iter = max_iter
        self.decision_function_shape = decision_function_shape
        self.random_state = random_state
|
||||
|
||||
|
||||
class PCA(BaseEstimator):
    """Parameter-only stand-in for a PCA estimator.

    ``__init__`` stores every argument unchanged on an attribute of the same
    name; nothing else is defined here.
    """

    def __init__(
        self,
        n_components=None,
        copy=True,
        whiten=False,
        svd_solver="auto",
        tol=0.0,
        iterated_power="auto",
        random_state=None,
    ):
        # Attributes are set in alphabetical order.
        self.copy = copy
        self.iterated_power = iterated_power
        self.n_components = n_components
        self.random_state = random_state
        self.svd_solver = svd_solver
        self.tol = tol
        self.whiten = whiten
|
||||
|
||||
|
||||
class NMF(BaseEstimator):
    """Parameter-only stand-in for an NMF estimator.

    ``__init__`` stores every argument unchanged on an attribute of the same
    name; nothing else is defined here.
    """

    def __init__(
        self,
        n_components=None,
        init=None,
        solver="cd",
        beta_loss="frobenius",
        tol=1e-4,
        max_iter=200,
        random_state=None,
        alpha=0.0,
        l1_ratio=0.0,
        verbose=0,
        shuffle=False,
    ):
        # Attributes are set in alphabetical order.
        self.alpha = alpha
        self.beta_loss = beta_loss
        self.init = init
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.n_components = n_components
        self.random_state = random_state
        self.shuffle = shuffle
        self.solver = solver
        self.tol = tol
        self.verbose = verbose
|
||||
|
||||
|
||||
class SimpleImputer(BaseEstimator):
    """Parameter-only stand-in for a SimpleImputer estimator.

    ``__init__`` stores every argument unchanged on an attribute of the same
    name; nothing else is defined here.
    """

    def __init__(
        self,
        missing_values=np.nan,
        strategy="mean",
        fill_value=None,
        verbose=0,
        copy=True,
    ):
        # Attributes are set in alphabetical order.
        self.copy = copy
        self.fill_value = fill_value
        self.missing_values = missing_values
        self.strategy = strategy
        self.verbose = verbose
|
||||
|
||||
|
||||
def test_basic(print_changed_only_false):
# Compares the __repr__() of a default-constructed LogisticRegression with a
# hard-coded multi-line string that lists every parameter.
# NOTE(review): `print_changed_only_false` is presumably the conftest fixture
# that sets print_changed_only=False for the test -- confirm.
# NOTE(review): the continuation lines of `expected` carry no leading
# indentation in this rendering; presumably stripped by the viewer -- confirm
# against the real file before editing the literal.
|
||||
# Basic pprint test
|
||||
lr = LogisticRegression()
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=None, max_iter=100,
|
||||
multi_class='warn', n_jobs=None, penalty='l2',
|
||||
random_state=None, solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
|
||||
def test_changed_only():
# Checks four reprs against hard-coded strings (a one-line repr, a repr that
# wraps onto a second line, and two SimpleImputer cases), then calls repr()
# on an estimator holding an array parameter purely to check it does not raise.
# NOTE(review): the continuation line of the wrapped `expected` literal has no
# leading indentation in this rendering; presumably stripped -- confirm.
|
||||
# Make sure the changed_only param is correctly used when True (default)
|
||||
lr = LogisticRegression(C=99)
|
||||
expected = """LogisticRegression(C=99)"""
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
# Check with a repr that doesn't fit on a single line
|
||||
lr = LogisticRegression(
|
||||
C=99, class_weight=0.4, fit_intercept=False, tol=1234, verbose=True
|
||||
)
|
||||
expected = """
|
||||
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
|
||||
verbose=True)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert lr.__repr__() == expected
|
||||
|
||||
imputer = SimpleImputer(missing_values=0)
|
||||
expected = """SimpleImputer(missing_values=0)"""
|
||||
assert imputer.__repr__() == expected
|
||||
|
||||
# Defaults to np.NaN, trying with float('NaN')
|
||||
imputer = SimpleImputer(missing_values=float("NaN"))
|
||||
expected = """SimpleImputer()"""
|
||||
assert imputer.__repr__() == expected
|
||||
|
||||
# make sure array parameters don't throw error (see #13583)
|
||||
repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
|
||||
|
||||
|
||||
def test_pipeline(print_changed_only_false):
# Builds a two-step pipeline with make_pipeline (StandardScaler followed by
# LogisticRegression(C=999)) and compares its __repr__() with a hard-coded
# multi-line string.
# NOTE(review): the continuation lines of `expected` carry no leading
# indentation in this rendering; presumably stripped -- confirm.
|
||||
# Render a pipeline object
|
||||
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
|
||||
expected = """
|
||||
Pipeline(memory=None,
|
||||
steps=[('standardscaler',
|
||||
StandardScaler(copy=True, with_mean=True, with_std=True)),
|
||||
('logisticregression',
|
||||
LogisticRegression(C=999, class_weight=None, dual=False,
|
||||
fit_intercept=True, intercept_scaling=1,
|
||||
l1_ratio=None, max_iter=100,
|
||||
multi_class='warn', n_jobs=None,
|
||||
penalty='l2', random_state=None,
|
||||
solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False))],
|
||||
verbose=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pipeline.__repr__() == expected
|
||||
|
||||
|
||||
def test_deeply_nested(print_changed_only_false):
# Wraps a default LogisticRegression in seven nested RFE estimators and
# compares the __repr__() of the outermost one with a hard-coded string.
# NOTE(review): the continuation lines of `expected` carry no leading
# indentation in this rendering; presumably stripped -- confirm.
|
||||
# Render a deeply nested estimator
|
||||
rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
|
||||
expected = """
|
||||
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
|
||||
class_weight=None,
|
||||
dual=False,
|
||||
fit_intercept=True,
|
||||
intercept_scaling=1,
|
||||
l1_ratio=None,
|
||||
max_iter=100,
|
||||
multi_class='warn',
|
||||
n_jobs=None,
|
||||
penalty='l2',
|
||||
random_state=None,
|
||||
solver='warn',
|
||||
tol=0.0001,
|
||||
verbose=0,
|
||||
warm_start=False),
|
||||
n_features_to_select=None,
|
||||
step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None,
|
||||
step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None,
|
||||
step=1, verbose=0),
|
||||
n_features_to_select=None, step=1,
|
||||
verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0),
|
||||
n_features_to_select=None, step=1, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert rfe.__repr__() == expected
|
||||
|
||||
|
||||
def test_gridsearch(print_changed_only_false):
# Builds GridSearchCV(SVC(), param_grid, cv=5) with a two-entry parameter grid
# and compares its __repr__() with a hard-coded multi-line string.
# NOTE(review): the continuation lines of `expected` carry no leading
# indentation in this rendering; presumably stripped -- confirm.
|
||||
# render a gridsearch
|
||||
param_grid = [
|
||||
{"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
|
||||
{"kernel": ["linear"], "C": [1, 10, 100, 1000]},
|
||||
]
|
||||
gs = GridSearchCV(SVC(), param_grid, cv=5)
|
||||
|
||||
expected = """
|
||||
GridSearchCV(cv=5, error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
|
||||
'kernel': ['rbf']},
|
||||
{'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert gs.__repr__() == expected
|
||||
|
||||
|
||||
def test_gridsearch_pipeline(print_changed_only_false):
# Formats a GridSearchCV wrapping a two-step Pipeline through
# _EstimatorPrettyPrinter.pformat and compares the result with a hard-coded
# string. The address in the repr of the `chi2` function is replaced by a
# fixed placeholder with re.sub before the comparison.
# NOTE(review): the continuation lines of `expected` carry no leading
# indentation in this rendering; presumably stripped -- confirm.
|
||||
# render a pipeline inside a gridsearch
|
||||
pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
|
||||
|
||||
pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
|
||||
N_FEATURES_OPTIONS = [2, 4, 8]
|
||||
C_OPTIONS = [1, 10, 100, 1000]
|
||||
param_grid = [
|
||||
{
|
||||
"reduce_dim": [PCA(iterated_power=7), NMF()],
|
||||
"reduce_dim__n_components": N_FEATURES_OPTIONS,
|
||||
"classify__C": C_OPTIONS,
|
||||
},
|
||||
{
|
||||
"reduce_dim": [SelectKBest(chi2)],
|
||||
"reduce_dim__k": N_FEATURES_OPTIONS,
|
||||
"classify__C": C_OPTIONS,
|
||||
},
|
||||
]
|
||||
gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv=3, error_score='raise-deprecating',
|
||||
estimator=Pipeline(memory=None,
|
||||
steps=[('reduce_dim',
|
||||
PCA(copy=True, iterated_power='auto',
|
||||
n_components=None,
|
||||
random_state=None,
|
||||
svd_solver='auto', tol=0.0,
|
||||
whiten=False)),
|
||||
('classify',
|
||||
SVC(C=1.0, cache_size=200,
|
||||
class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr',
|
||||
degree=3, gamma='auto_deprecated',
|
||||
kernel='rbf', max_iter=-1,
|
||||
probability=False,
|
||||
random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False))]),
|
||||
iid='warn', n_jobs=1,
|
||||
param_grid=[{'classify__C': [1, 10, 100, 1000],
|
||||
'reduce_dim': [PCA(copy=True, iterated_power=7,
|
||||
n_components=None,
|
||||
random_state=None,
|
||||
svd_solver='auto', tol=0.0,
|
||||
whiten=False),
|
||||
NMF(alpha=0.0, beta_loss='frobenius',
|
||||
init=None, l1_ratio=0.0,
|
||||
max_iter=200, n_components=None,
|
||||
random_state=None, shuffle=False,
|
||||
solver='cd', tol=0.0001,
|
||||
verbose=0)],
|
||||
'reduce_dim__n_components': [2, 4, 8]},
|
||||
{'classify__C': [1, 10, 100, 1000],
|
||||
'reduce_dim': [SelectKBest(k=10,
|
||||
score_func=<function chi2 at some_address>)],
|
||||
'reduce_dim__k': [2, 4, 8]}],
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
repr_ = pp.pformat(gspipline)
|
||||
# Remove address of '<function chi2 at 0x.....>' for reproducibility
|
||||
repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_)
|
||||
assert repr_ == expected
|
||||
|
||||
|
||||
def test_n_max_elements_to_show(print_changed_only_false):
# Formats estimators through an _EstimatorPrettyPrinter created with
# n_max_elements_to_show=30 and compares the output with hard-coded strings.
# A dict or list parameter with exactly 30 entries is expected in full; with
# 31 entries the expected string ends the container with ", ...".
# NOTE(review): the continuation lines of each `expected` literal carry no
# leading indentation in this rendering; presumably stripped -- confirm.
|
||||
|
||||
n_max_elements_to_show = 30
|
||||
pp = _EstimatorPrettyPrinter(
|
||||
compact=True,
|
||||
indent=1,
|
||||
indent_at_name=True,
|
||||
n_max_elements_to_show=n_max_elements_to_show,
|
||||
)
|
||||
|
||||
# No ellipsis
|
||||
vocabulary = {i: i for i in range(n_max_elements_to_show)}
|
||||
vectorizer = CountVectorizer(vocabulary=vocabulary)
|
||||
|
||||
expected = r"""
|
||||
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
|
||||
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
|
||||
lowercase=True, max_df=1.0, max_features=None, min_df=1,
|
||||
ngram_range=(1, 1), preprocessor=None, stop_words=None,
|
||||
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
|
||||
tokenizer=None,
|
||||
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
|
||||
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
|
||||
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
|
||||
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
|
||||
27: 27, 28: 28, 29: 29})"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(vectorizer) == expected
|
||||
|
||||
# Now with ellipsis
|
||||
vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
|
||||
vectorizer = CountVectorizer(vocabulary=vocabulary)
|
||||
|
||||
expected = r"""
|
||||
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
|
||||
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
|
||||
lowercase=True, max_df=1.0, max_features=None, min_df=1,
|
||||
ngram_range=(1, 1), preprocessor=None, stop_words=None,
|
||||
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
|
||||
tokenizer=None,
|
||||
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
|
||||
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
|
||||
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
|
||||
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
|
||||
27: 27, 28: 28, 29: 29, ...})"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(vectorizer) == expected
|
||||
|
||||
# Also test with lists
|
||||
param_grid = {"C": list(range(n_max_elements_to_show))}
|
||||
gs = GridSearchCV(SVC(), param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv='warn', error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
||||
27, 28, 29]},
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(gs) == expected
|
||||
|
||||
# Now with ellipsis
|
||||
param_grid = {"C": list(range(n_max_elements_to_show + 1))}
|
||||
gs = GridSearchCV(SVC(), param_grid)
|
||||
expected = """
|
||||
GridSearchCV(cv='warn', error_score='raise-deprecating',
|
||||
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
|
||||
decision_function_shape='ovr', degree=3,
|
||||
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
|
||||
probability=False, random_state=None, shrinking=True,
|
||||
tol=0.001, verbose=False),
|
||||
iid='warn', n_jobs=None,
|
||||
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
||||
27, 28, 29, ...]},
|
||||
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
|
||||
scoring=None, verbose=0)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert pp.pformat(gs) == expected
|
||||
|
||||
|
||||
def test_bruteforce_ellipsis(print_changed_only_false):
# Calls LogisticRegression().__repr__ with several N_CHAR_MAX values and
# compares each result with a hard-coded string.
# NOTE(review): the continuation lines of each `expected` literal carry no
# leading indentation in this rendering; presumably stripped -- confirm.
|
||||
# Check that the bruteforce ellipsis (used when the number of non-blank
|
||||
# characters exceeds N_CHAR_MAX) renders correctly.
|
||||
|
||||
lr = LogisticRegression()
|
||||
|
||||
# test when the left and right side of the ellipsis aren't on the same
|
||||
# line.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
in...
|
||||
multi_class='warn', n_jobs=None, penalty='l2',
|
||||
random_state=None, solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert expected == lr.__repr__(N_CHAR_MAX=150)
|
||||
|
||||
# test with very small N_CHAR_MAX
|
||||
# Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
|
||||
# weird reprs we still keep the whole line of the right part (after the
|
||||
# ellipsis).
|
||||
expected = """
|
||||
Lo...
|
||||
warm_start=False)"""
|
||||
|
||||
expected = expected[1:] # remove first \n
|
||||
assert expected == lr.__repr__(N_CHAR_MAX=4)
|
||||
|
||||
# test with N_CHAR_MAX == number of non-blank characters: In this case we
|
||||
# don't want ellipsis
|
||||
full_repr = lr.__repr__(N_CHAR_MAX=float("inf"))
|
||||
n_nonblank = len("".join(full_repr.split()))
|
||||
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
|
||||
assert "..." not in full_repr
|
||||
|
||||
# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
|
||||
# right side of the ellipsis are on different lines. In this case we
|
||||
# want to expand the whole line of the right side
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=None, max_i...
|
||||
multi_class='warn', n_jobs=None, penalty='l2',
|
||||
random_state=None, solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)
|
||||
|
||||
# test with N_CHAR_MAX == number of non-blank characters - 4: the left and
|
||||
# right side of the ellipsis are on the same line. In this case we don't
|
||||
# want to expand the whole line of the right side, just add the ellipsis
|
||||
# between the 2 sides.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=None, max_iter...,
|
||||
multi_class='warn', n_jobs=None, penalty='l2',
|
||||
random_state=None, solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)
|
||||
|
||||
# test with N_CHAR_MAX == number of non-blank characters - 2: the left and
|
||||
# right side of the ellipsis are on the same line, but adding the ellipsis
|
||||
# would actually make the repr longer. So we don't add the ellipsis.
|
||||
expected = """
|
||||
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
|
||||
intercept_scaling=1, l1_ratio=None, max_iter=100,
|
||||
multi_class='warn', n_jobs=None, penalty='l2',
|
||||
random_state=None, solver='warn', tol=0.0001, verbose=0,
|
||||
warm_start=False)"""
|
||||
expected = expected[1:] # remove first \n
|
||||
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
|
||||
|
||||
|
||||
def test_builtin_prettyprinter():
    """Pretty-print an estimator with the builtin ``PrettyPrinter``."""
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # Used to be a bug.
    printer = PrettyPrinter()
    estimator = LogisticRegression()
    printer.pprint(estimator)
|
||||
|
||||
|
||||
def test_kwargs_in_init():
    """``__repr__`` copes with parameters that were passed as ``**kwargs``.

    Make sure the changed_only=True mode is OK when an argument is passed as
    kwargs. Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/17206
    """

    class WithKWargs(BaseEstimator):
        # Estimator with a kwargs argument. These need to hack around
        # set_params and get_params. Here we mimic what LightGBM does.
        def __init__(self, a="willchange", b="unchanged", **kwargs):
            self.a = a
            self.b = b
            self._other_params = {}
            self.set_params(**kwargs)

        def get_params(self, deep=True):
            merged = super().get_params(deep=deep)
            merged.update(self._other_params)
            return merged

        def set_params(self, **params):
            for name in params:
                setattr(self, name, params[name])
                self._other_params[name] = params[name]
            return self

    est = WithKWargs(a="something", c="abcd", d=None)

    # Default configuration: `b` keeps its default and is not rendered.
    assert est.__repr__() == "WithKWargs(a='something', c='abcd', d=None)"

    # With print_changed_only=False every parameter is rendered.
    with config_context(print_changed_only=False):
        expected_full = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
        assert est.__repr__() == expected_full
|
||||
|
||||
|
||||
def test_complexity_print_changed_only():
    """``__repr__`` is called equally often in both ``print_changed_only`` modes.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18490
    """

    class DummyEstimator(TransformerMixin, BaseEstimator):
        # Class-level counter incremented by every __repr__ call.
        nb_times_repr_called = 0

        def __init__(self, estimator=None):
            self.estimator = estimator

        def __repr__(self):
            DummyEstimator.nb_times_repr_called += 1
            return super().__repr__()

        def transform(self, X, copy=None):  # pragma: no cover
            return X

    inner_pipeline = make_pipeline(
        DummyEstimator(DummyEstimator()), DummyEstimator(), "passthrough"
    )
    estimator = DummyEstimator(inner_pipeline)

    def count_repr_calls(changed_only):
        # Render once under the given setting and report the counter value.
        with config_context(print_changed_only=changed_only):
            repr(estimator)
        return DummyEstimator.nb_times_repr_called

    calls_with_all_params = count_repr_calls(False)

    # Start the second measurement from zero.
    DummyEstimator.nb_times_repr_called = 0
    calls_with_changed_only = count_repr_calls(True)

    assert calls_with_all_params == calls_with_changed_only
|
||||
@@ -0,0 +1,193 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from scipy.special import comb
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
|
||||
from sklearn.utils.random import _random_choice_csc, sample_without_replacement
|
||||
from sklearn.utils._random import _our_rand_r_py
|
||||
|
||||
|
||||
###############################################################################
|
||||
# test custom sampling without replacement algorithm
|
||||
###############################################################################
|
||||
def test_invalid_sample_without_replacement_algorithm():
    """``sample_without_replacement(5, 4, "unknown")`` must raise ``ValueError``."""
    pytest.raises(ValueError, sample_without_replacement, 5, 4, "unknown")
|
||||
|
||||
|
||||
def test_sample_without_replacement_algorithms():
    """Run the three shared sampler checks once per ``method`` value."""
    for method_name in ("auto", "tracking_selection", "reservoir_sampling", "pool"):

        def sampler(n_population, n_samples, random_state=None):
            # Forward to sample_without_replacement with `method` set to the
            # value of the current loop iteration.
            return sample_without_replacement(
                n_population,
                n_samples,
                method=method_name,
                random_state=random_state,
            )

        for check in (
            check_edge_case_of_sample_int,
            check_sample_int,
            check_sample_int_distribution,
        ):
            check(sampler)
|
||||
|
||||
|
||||
def check_edge_case_of_sample_int(sample_without_replacement):
    """Exercise boundary sizes of a ``(n_population, n_samples)`` sampler.

    ``sample_without_replacement`` is called positionally with
    ``(n_population, n_samples)``.
    """
    # n_population < n_samples
    for n_population, n_samples in ((0, 1), (1, 2)):
        with pytest.raises(ValueError):
            sample_without_replacement(n_population, n_samples)

    # n_population == n_samples, then n_population >= n_samples: the result
    # must have one entry per requested sample.
    for n_population, n_samples in ((0, 0), (1, 1), (5, 0), (5, 1)):
        drawn = sample_without_replacement(n_population, n_samples)
        assert drawn.shape == (n_samples,)

    # n_population < 0 or n_samples < 0
    for n_population, n_samples in ((-1, 5), (5, -1)):
        with pytest.raises(ValueError):
            sample_without_replacement(n_population, n_samples)
|
||||
|
||||
|
||||
def check_sample_int(sample_without_replacement):
    """Check length, uniqueness and value range of a sampler's output.

    This test is heavily inspired from test_random.py of python-core.

    For the entire allowable range of 0 <= k <= N, validate that the sample
    is of the correct length and contains only unique items.

    Parameters
    ----------
    sample_without_replacement : callable
        Called positionally as
        ``sample_without_replacement(n_population, n_samples)``.

    Raises
    ------
    AssertionError
        If a sample has the wrong length, contains duplicates, or contains a
        value outside ``[0, n_population)``.
    """
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert len(s) == n_samples
        unique = np.unique(s)
        assert np.size(unique) == n_samples
        # Check both ends of the range: testing only the upper bound would
        # let negative values through unnoticed.
        assert np.all(unique >= 0)
        assert np.all(unique < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0
|
||||
|
||||
|
||||
def check_sample_int_distribution(sample_without_replacement):
    """Check that a sampler eventually produces every possible subset.

    This test is heavily inspired from test_random.py of python-core.

    For each ``n_samples`` in ``range(10)``, the sampler is called with a
    population of 10 until every combination has been seen; an
    ``AssertionError`` is raised if 10000 trials are not enough.
    """
    n_population = 10

    # A large number of trials prevents false negatives without slowing the
    # normal case.
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        seen = set()
        for _ in range(n_trials):
            seen.add(frozenset(sample_without_replacement(n_population, n_samples)))
            if len(seen) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)"
                % (len(seen), n_expected)
            )
|
||||
|
||||
|
||||
def test_random_choice_csc(n_samples=10000, random_state=24):
    """Per-column class frequencies of ``_random_choice_csc`` match expectations."""

    def assert_frequencies(got, classes, class_probabilities, pad=False):
        # The result must be sparse, and the empirical frequency of each value
        # in column k must match class_probabilities[k] to one decimal.
        assert sp.issparse(got)
        for k in range(len(classes)):
            minlength = len(class_probabilities[k]) if pad else 0
            column = got.getcol(k).toarray().ravel()
            p = np.bincount(column, minlength=minlength) / float(n_samples)
            assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    assert_frequencies(
        _random_choice_csc(n_samples, classes, class_probabilities, random_state),
        classes,
        class_probabilities,
    )

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]
    assert_frequencies(
        _random_choice_csc(
            n_samples=n_samples, classes=classes, random_state=random_state
        ),
        classes,
        class_probabilities,
    )

    # Edge case probabilities 1.0 and 0.0; counts are padded to the number of
    # classes so that never-drawn trailing classes still get a zero entry.
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
    assert_frequencies(
        _random_choice_csc(n_samples, classes, class_probabilities, random_state),
        classes,
        class_probabilities,
        pad=True,
    )

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
    assert_frequencies(
        _random_choice_csc(
            n_samples=n_samples, classes=classes, random_state=random_state
        ),
        classes,
        class_probabilities,
    )
|
||||
|
||||
|
||||
def test_random_choice_csc_errors():
    """Each invalid input combination makes ``_random_choice_csc`` raise."""
    invalid_inputs = [
        # the length of an array in classes and class_probabilities is mismatched
        (
            [np.array([0, 1]), np.array([0, 1, 2, 3])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # the class dtype is not supported
        (
            [np.array(["a", "1"]), np.array(["z", "1", "2"])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # the class dtype is not supported
        (
            [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # Given probabilities don't sum to 1
        (
            [np.array([0, 1]), np.array([0, 1, 2])],
            [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])],
        ),
    ]

    for classes, class_probabilities in invalid_inputs:
        with pytest.raises(ValueError):
            _random_choice_csc(4, classes, class_probabilities, 1)
|
||||
|
||||
|
||||
def test_our_rand_r():
    """``_our_rand_r_py`` returns the pinned value for two fixed inputs."""
    pinned_outputs = {1273642419: 131541053, 0: 270369}
    for argument, expected in pinned_outputs.items():
        assert expected == _our_rand_r_py(argument)
|
||||
@@ -0,0 +1,41 @@
|
||||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._readonly_array_wrapper import ReadonlyArrayWrapper, _test_sum
|
||||
from sklearn.utils._testing import create_memmap_backed_data
|
||||
|
||||
|
||||
def _readonly_array_copy(x):
|
||||
"""Return a copy of x with flag writeable set to False."""
|
||||
y = x.copy()
|
||||
y.flags["WRITEABLE"] = False
|
||||
return y
|
||||
|
||||
|
||||
def _create_memmap_backed_data(data):
    """Call ``create_memmap_backed_data`` on ``data`` with fixed options."""
    fixed_options = {"mmap_mode": "r", "return_folder": False, "aligned": True}
    return create_memmap_backed_data(data, **fixed_options)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_readonly_array_wrapper(readonly, dtype):
    """Test that ReadonlyArrayWrapper allows working with fused-typed."""
    original = np.arange(10).astype(dtype)
    reference_sum = _test_sum(original)

    # ReadonlyArrayWrapper works with writable buffers
    wrapped_writable = ReadonlyArrayWrapper(original)
    assert _test_sum(wrapped_writable) == pytest.approx(reference_sum, rel=1e-11)

    # Now, check on readonly buffers: passed directly they are rejected ...
    frozen = readonly(original)
    with pytest.raises(ValueError, match="buffer source array is read-only"):
        _test_sum(frozen)

    # ... but once wrapped they give the same sum as the original.
    wrapped_frozen = ReadonlyArrayWrapper(frozen)
    assert _test_sum(wrapped_frozen) == pytest.approx(reference_sum, rel=1e-11)
|
||||
@@ -0,0 +1,170 @@
|
||||
# Author: Tom Dupre la Tour
|
||||
# Joan Massich <mailsik@gmail.com>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from numpy.testing import assert_array_equal
|
||||
from sklearn.utils._seq_dataset import (
|
||||
ArrayDataset32,
|
||||
ArrayDataset64,
|
||||
CSRDataset32,
|
||||
CSRDataset64,
|
||||
)
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
# Module-level reference data: the iris dataset in float64 and float32, each
# as a dense array, a CSR copy, targets cast to the same float dtype, and a
# sample-weight vector 0..n-1 of that dtype.
iris = load_iris()
|
||||
X64 = iris.data.astype(np.float64)
|
||||
y64 = iris.target.astype(np.float64)
|
||||
X_csr64 = sp.csr_matrix(X64)
|
||||
sample_weight64 = np.arange(y64.size, dtype=np.float64)
|
||||
|
||||
X32 = iris.data.astype(np.float32)
|
||||
y32 = iris.target.astype(np.float32)
|
||||
X_csr32 = sp.csr_matrix(X32)
|
||||
sample_weight32 = np.arange(y32.size, dtype=np.float32)
|
||||
|
||||
|
||||
def assert_csr_equal_values(current, expected):
    """Assert that two CSR matrices store the same values.

    ``eliminate_zeros()`` is called in place on both arguments, and
    ``expected`` is cast to the dtype of ``current`` before the shapes and the
    ``data`` / ``indices`` / ``indptr`` arrays are compared.
    """
    for matrix in (current, expected):
        matrix.eliminate_zeros()
    expected = expected.astype(current.dtype)

    for axis in (0, 1):
        assert current.shape[axis] == expected.shape[axis]
    for attribute in ("data", "indices", "indptr"):
        assert_array_equal(getattr(current, attribute), getattr(expected, attribute))
|
||||
|
||||
|
||||
def make_dense_dataset_32():
    """Build an ``ArrayDataset32`` from ``X32``, ``y32`` and ``sample_weight32``."""
    dataset = ArrayDataset32(X32, y32, sample_weight32, seed=42)
    return dataset
|
||||
|
||||
|
||||
def make_dense_dataset_64():
    """Build an ``ArrayDataset64`` from ``X64``, ``y64`` and ``sample_weight64``."""
    dataset = ArrayDataset64(X64, y64, sample_weight64, seed=42)
    return dataset
|
||||
|
||||
|
||||
def make_sparse_dataset_32():
    """Build a ``CSRDataset32`` from the three arrays of ``X_csr32``."""
    csr = X_csr32
    return CSRDataset32(
        csr.data,
        csr.indptr,
        csr.indices,
        y32,
        sample_weight32,
        seed=42,
    )
|
||||
|
||||
|
||||
def make_sparse_dataset_64():
    """Build a ``CSRDataset64`` from the three arrays of ``X_csr64``."""
    csr = X_csr64
    return CSRDataset64(
        csr.data,
        csr.indptr,
        csr.indices,
        y64,
        sample_weight64,
        seed=42,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dataset_constructor",
    [
        make_dense_dataset_32,
        make_dense_dataset_64,
        make_sparse_dataset_32,
        make_sparse_dataset_64,
    ],
)
def test_seq_dataset_basic_iteration(dataset_constructor):
    """Samples returned by a dataset match the float64 reference data."""
    n_runs = 5
    n_features = X64.shape[1]
    dataset = dataset_constructor()

    for _ in range(n_runs):
        # In every run: first the next sample, then a random sample.
        for draw in (dataset._next_py, dataset._random_py):
            xi_, yi, swi, idx = draw()
            xi = sp.csr_matrix(xi_, shape=(1, n_features))

            assert_csr_equal_values(xi, X_csr64[idx])
            assert yi == y64[idx]
            assert swi == sample_weight64[idx]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "make_dense_dataset,make_sparse_dataset",
    [
        (make_dense_dataset_32, make_sparse_dataset_32),
        (make_dense_dataset_64, make_sparse_dataset_64),
    ],
)
def test_seq_dataset_shuffle(make_dense_dataset, make_sparse_dataset):
    """Dense and sparse datasets return the same pinned sample indices."""
    dense_dataset, sparse_dataset = make_dense_dataset(), make_sparse_dataset()

    def next_indices():
        # Index of the next sample of the dense, then of the sparse dataset.
        _, _, _, dense_idx = dense_dataset._next_py()
        _, _, _, sparse_idx = sparse_dataset._next_py()
        return dense_idx, sparse_idx

    def random_indices():
        # Index of a random sample of the dense, then of the sparse dataset.
        _, _, _, dense_idx = dense_dataset._random_py()
        _, _, _, sparse_idx = sparse_dataset._random_py()
        return dense_idx, sparse_idx

    # not shuffled
    for expected in range(5):
        dense_idx, sparse_idx = next_indices()
        assert dense_idx == expected
        assert sparse_idx == expected

    for expected in [132, 50, 9, 18, 58]:
        dense_idx, sparse_idx = random_indices()
        assert dense_idx == expected
        assert sparse_idx == expected

    seed = 77
    dense_dataset._shuffle_py(seed)
    sparse_dataset._shuffle_py(seed)

    idx_next = [63, 91, 148, 87, 29]
    idx_shuffle = [137, 125, 56, 121, 127]
    for expected_next, expected_random in zip(idx_next, idx_shuffle):
        dense_idx, sparse_idx = next_indices()
        assert dense_idx == expected_next
        assert sparse_idx == expected_next

        dense_idx, sparse_idx = random_indices()
        assert dense_idx == expected_random
        assert sparse_idx == expected_random
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "make_dataset_32,make_dataset_64",
    [
        (make_dense_dataset_32, make_dense_dataset_64),
        (make_sparse_dataset_32, make_sparse_dataset_64),
    ],
)
def test_fused_types_consistency(make_dataset_32, make_dataset_64):
    """Check the 32-bit and 64-bit datasets yield matching samples.

    The sample data must carry the expected float dtype, and data and targets
    must agree between the two datasets up to ``rtol=1e-5``.
    """
    dataset_32 = make_dataset_32()
    dataset_64 = make_dataset_64()

    n_rounds = 5
    for _ in range(n_rounds):
        # next sample from each dataset; only data and target are inspected
        (data_32, _, _), target_32, _, _ = dataset_32._next_py()
        (data_64, _, _), target_64, _, _ = dataset_64._next_py()

        assert data_32.dtype == np.float32
        assert data_64.dtype == np.float64

        assert_allclose(data_64, data_32, rtol=1e-5)
        assert_allclose(target_64, target_32, rtol=1e-5)
|
||||
|
||||
|
||||
def test_buffer_dtype_mismatch_error():
    """Check each dataset class rejects buffers of the other float width.

    Every constructor is fed arrays whose names indicate the opposite
    precision and must raise ``ValueError`` mentioning "Buffer dtype mismatch".
    """
    # The calls are plain statements: a trailing comma after each call would
    # only wrap the result in a throwaway one-element tuple.
    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset64(X32, y32, sample_weight32, seed=42)

    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset32(X64, y64, sample_weight64, seed=42)

    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        CSRDataset64(
            X_csr32.data, X_csr32.indptr, X_csr32.indices, y32, sample_weight32, seed=42
        )

    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        CSRDataset32(
            X_csr64.data, X_csr64.indptr, X_csr64.indices, y64, sample_weight64, seed=42
        )
|
||||
@@ -0,0 +1,103 @@
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from sklearn.utils.graph import graph_shortest_path, single_source_shortest_path_length
|
||||
|
||||
|
||||
# FIXME: to be removed in 1.2
def test_graph_shortest_path_deprecation():
    """Check that `graph_shortest_path` emits a "deprecated" FutureWarning."""
    distances = generate_graph(20)

    with pytest.warns(FutureWarning, match="deprecated"):
        graph_shortest_path(distances)
|
||||
|
||||
|
||||
def floyd_warshall_slow(graph, directed=False):
    """Compute all-pairs shortest path lengths with plain Python loops.

    Parameters
    ----------
    graph : ndarray of shape (N, N)
        Edge weights, where a zero entry means "no edge". A floating-point
        array is modified in place; any other dtype is first converted to a
        float64 copy so that infinity can be stored in it.
    directed : bool, default=False
        If False, each pair of nodes uses the smaller of its two directed
        weights in both directions.

    Returns
    -------
    ndarray of shape (N, N)
        Shortest path lengths, with 0 on the diagonal and for unreachable
        pairs.
    """
    # Integer (or other non-float) arrays cannot hold np.inf.
    if not np.issubdtype(graph.dtype, np.floating):
        graph = graph.astype(np.float64)

    n_nodes = graph.shape[0]

    # set zero entries (missing edges) to infinity
    graph[graph == 0] = np.inf

    # set diagonal to zero
    graph.flat[:: n_nodes + 1] = 0

    if not directed:
        graph = np.minimum(graph, graph.T)

    # Deliberately unvectorized triple loop.
    for k in range(n_nodes):
        for i in range(n_nodes):
            for j in range(n_nodes):
                graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])

    # pairs that are still infinitely far apart are reported as 0
    graph[np.isinf(graph)] = 0

    return graph
|
||||
|
||||
|
||||
def generate_graph(N=20, seed=0):
    """Build a random, sparsified square matrix of non-negative distances.

    Parameters
    ----------
    N : int, default=20
        Number of nodes; the result has shape (N, N).
    seed : int, default=0
        Seed for the ``numpy.random.RandomState`` used to draw the values and
        the positions that are zeroed. The default gives the same matrix on
        every call.

    Returns
    -------
    ndarray of shape (N, N)
        Matrix with a zero diagonal. Entries are zeroed one position at a
        time, so the result is generally not symmetric.
    """
    # sparse grid of distances
    rng = np.random.RandomState(seed)
    dist_matrix = rng.random_sample((N, N))

    # make symmetric: distances are not direction-dependent
    dist_matrix = dist_matrix + dist_matrix.T

    # make graph sparse: zero N * N // 2 randomly drawn (row, col) positions
    n_removed = N * N // 2
    rows = rng.randint(N, size=n_removed)
    cols = rng.randint(N, size=n_removed)
    dist_matrix[rows, cols] = 0

    # set diagonal to zero
    dist_matrix.flat[:: N + 1] = 0

    return dist_matrix
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Function graph_shortest_path is deprecated")
def test_floyd_warshall():
    """Compare the "FW" method of `graph_shortest_path` with the slow reference."""
    distances = generate_graph(20)

    for is_directed in (True, False):
        computed = graph_shortest_path(distances, is_directed, "FW")
        # The reference works on a copy because it writes into its input.
        expected = floyd_warshall_slow(distances.copy(), is_directed)

        assert_array_almost_equal(computed, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Function graph_shortest_path is deprecated")
def test_dijkstra():
    """Compare the "D" method of `graph_shortest_path` with the slow reference."""
    distances = generate_graph(20)

    for is_directed in (True, False):
        computed = graph_shortest_path(distances, is_directed, "D")
        # The reference works on a copy because it writes into its input.
        expected = floyd_warshall_slow(distances.copy(), is_directed)

        assert_array_almost_equal(computed, expected)
|
||||
|
||||
|
||||
def test_shortest_path():
    """Compare `single_source_shortest_path_length` with the slow reference.

    Both are run on a graph whose edges all have weight 1, for the directed
    and the undirected case.
    """
    distances = generate_graph(20)
    # We compare path length and not costs (-> set distances to 0 or 1)
    distances[distances != 0] = 1

    for is_directed in (True, False):
        if not is_directed:
            distances = np.minimum(distances, distances.T)

        reference = floyd_warshall_slow(distances.copy(), is_directed)
        n_nodes = distances.shape[0]
        for source in range(n_nodes):
            # Non-reachable nodes have distance 0 in the reference, so nodes
            # missing from the returned mapping default to 0 as well.
            lengths = defaultdict(int)
            lengths.update(single_source_shortest_path_length(distances, source))

            for target in range(reference[source].shape[0]):
                assert_array_almost_equal(lengths[target], reference[source, target])
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Function graph_shortest_path is deprecated")
def test_dijkstra_bug_fix():
    """Check the "D" and "FW" methods agree on a small asymmetric graph."""
    X = np.array([[0.0, 0.0, 4.0], [1.0, 0.0, 2.0], [0.0, 5.0, 0.0]])

    # Call order is kept: "FW" first, then "D".
    from_floyd_warshall = graph_shortest_path(X, directed=False, method="FW")
    from_dijkstra = graph_shortest_path(X, directed=False, method="D")

    assert_array_almost_equal(from_dijkstra, from_floyd_warshall)
|
||||
@@ -0,0 +1,41 @@
|
||||
from sklearn.utils.fixes import threadpool_info
|
||||
from sklearn.utils._show_versions import _get_sys_info
|
||||
from sklearn.utils._show_versions import _get_deps_info
|
||||
from sklearn.utils._show_versions import show_versions
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
|
||||
def test_get_sys_info():
    """Check the system report contains the python, executable and machine keys."""
    sys_info = _get_sys_info()

    for key in ("python", "executable", "machine"):
        assert key in sys_info
|
||||
|
||||
|
||||
def test_get_deps_info():
    """Check the dependency report has an entry for every expected package."""
    with ignore_warnings():
        deps_info = _get_deps_info()

    expected_packages = (
        "pip",
        "setuptools",
        "sklearn",
        "numpy",
        "scipy",
        "Cython",
        "pandas",
        "matplotlib",
        "joblib",
    )
    for package in expected_packages:
        assert package in deps_info
|
||||
|
||||
|
||||
def test_show_versions(capsys):
    """Check the text printed by `show_versions` mentions the expected entries."""
    with ignore_warnings():
        show_versions()
        printed, _ = capsys.readouterr()

    for entry in ("python", "numpy"):
        assert entry in printed

    # The threadpoolctl section is only checked when there is info to report.
    if threadpool_info():
        assert "threadpoolctl info:" in printed
|
||||
@@ -0,0 +1,917 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from scipy import linalg
|
||||
from numpy.testing import assert_array_almost_equal, assert_array_equal
|
||||
from numpy.random import RandomState
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.utils.sparsefuncs import (
|
||||
mean_variance_axis,
|
||||
incr_mean_variance_axis,
|
||||
inplace_column_scale,
|
||||
inplace_row_scale,
|
||||
inplace_swap_row,
|
||||
inplace_swap_column,
|
||||
min_max_axis,
|
||||
count_nonzero,
|
||||
csc_median_axis_0,
|
||||
)
|
||||
from sklearn.utils.sparsefuncs_fast import (
|
||||
assign_rows_csr,
|
||||
inplace_csr_row_normalize_l1,
|
||||
inplace_csr_row_normalize_l2,
|
||||
csr_row_norms,
|
||||
)
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
def test_mean_variance_axis0():
    """Check `mean_variance_axis` along axis 0 against NumPy.

    LIL input must raise ``TypeError``; CSR and CSC inputs must return values
    matching ``np.mean`` / ``np.var`` with the expected output dtype.
    """
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    for row, col in ((0, 0), (2, 1), (4, 3)):
        X[row, col] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=0)

    sparse_inputs = (sp.csr_matrix(X_lil), sp.csc_matrix(X_lil))

    # (input dtype, dtype expected for the returned statistics)
    dtype_pairs = [
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.int32, np.float64),
        (np.int64, np.float64),
    ]

    for input_dtype, output_dtype in dtype_pairs:
        dense = X.astype(input_dtype)
        for sparse in sparse_inputs:
            typed_sparse = sparse.astype(input_dtype)
            means, variances = mean_variance_axis(typed_sparse, axis=0)
            assert means.dtype == output_dtype
            assert variances.dtype == output_dtype
            assert_array_almost_equal(means, np.mean(dense, axis=0))
            assert_array_almost_equal(variances, np.var(dense, axis=0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix])
def test_mean_variance_axis0_precision(dtype, sparse_constructor):
    """Check the weighted variance of a constant column stays below eps.

    Non-regression check for a loss of precision when the real variance is
    exactly 0. (#19766)
    """
    rng = np.random.RandomState(0)
    n_rows = 1000
    column = np.full(fill_value=100.0, shape=(n_rows, 1), dtype=dtype)
    # Add some missing records which should be ignored:
    nan_rows = rng.choice(np.arange(n_rows), 10, replace=False)
    column[nan_rows, 0] = np.nan
    X = sparse_constructor(column)

    # Random positive weights:
    sample_weight = rng.rand(X.shape[0]).astype(dtype)

    _, variance = mean_variance_axis(X, weights=sample_weight, axis=0)

    assert variance < np.finfo(dtype).eps
|
||||
|
||||
|
||||
def test_mean_variance_axis1():
    """Check `mean_variance_axis` along axis 1 against NumPy.

    LIL input must raise ``TypeError``; CSR and CSC inputs must return values
    matching ``np.mean`` / ``np.var`` along axis 1 with the expected output
    dtype.
    """
    axis = 1

    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=axis)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    # (input dtype, dtype expected for the returned statistics)
    expected_dtypes = [
        (np.float32, np.float32),
        (np.float64, np.float64),
        (np.int32, np.float64),
        (np.int64, np.float64),
    ]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            # Use the axis under test here: hard-coding axis=0 would just
            # repeat the axis-0 test.
            X_means, X_vars = mean_variance_axis(X_sparse, axis=axis)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=axis))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=axis))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["Xw", "X", "weights"],
    [
        ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 0, 1], [0, 1, 1, 1]], [1, 2, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
        (
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [1.0, 1.0, 1.0],
        ),
        (
            [[0, 0], [1, np.nan], [2, 0], [0, 3], [np.nan, np.nan], [np.nan, 2]],
            [
                [0, 0, 0],
                [1, 1, np.nan],
                [2, 2, 0],
                [0, 0, 3],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, 2],
            ],
            [2.0, 1.0],
        ),
        (
            [[1, 0, 1], [0, 3, 1]],
            [[1, 0, 0, 0, 1], [0, 3, 3, 3, 1]],
            np.array([1, 3, 1]),
        ),
    ],
)
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis1(
    Xw, X, weights, sparse_constructor, dtype
):
    """Check weighted incremental statistics along axis 1.

    In the parametrization, ``X`` is ``Xw`` with its columns repeated as the
    weights indicate. The weighted update on ``Xw`` must therefore match the
    unweighted update on ``X``, for a first and a second incremental round,
    and the weighted outputs must keep the input dtype.
    """
    axis = 1
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
    last_var = np.zeros_like(last_mean, dtype=dtype)
    last_n = np.zeros_like(last_mean, dtype=np.int64)
    # Both first-round calls start from the very same zeroed statistics.
    start = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)

    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, weights=None, **start
    )
    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, weights=weights, **start
    )

    for weighted_output in (means_w0, vars_w0, n_incr_w0):
        assert weighted_output.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=means0,
        last_var=vars0,
        last_n=n_incr0,
        weights=None,
    )
    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=means_w0,
        last_var=vars_w0,
        last_n=n_incr_w0,
        weights=weights,
    )

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    for weighted_output in (means_w1, vars_w1, n_incr_w1):
        assert weighted_output.dtype == dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["Xw", "X", "weights"],
    [
        ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1], [0, 1, 1]], [1, 2]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
        (
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [1.0, 1.0],
        ),
        (
            [[0, 0, 1, np.nan, 2, 0], [0, 3, np.nan, np.nan, np.nan, 2]],
            [
                [0, 0, 1, np.nan, 2, 0],
                [0, 0, 1, np.nan, 2, 0],
                [0, 3, np.nan, np.nan, np.nan, 2],
            ],
            [2.0, 1.0],
        ),
        (
            [[1, 0, 1], [0, 0, 1]],
            [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
            np.array([1, 3]),
        ),
    ],
)
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis0(
    Xw, X, weights, sparse_constructor, dtype
):
    """Check weighted incremental statistics along axis 0.

    In the parametrization, ``X`` is ``Xw`` with its rows repeated as the
    weights indicate. The weighted update on ``Xw`` must therefore match the
    unweighted update on ``X``, for a first and a second incremental round,
    and the weighted outputs must keep the input dtype.
    """
    axis = 0
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.size(Xw, 1), dtype=dtype)
    last_var = np.zeros_like(last_mean)
    last_n = np.zeros_like(last_mean, dtype=np.int64)
    # Both first-round calls start from the very same zeroed statistics.
    start = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)

    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, weights=None, **start
    )
    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, weights=weights, **start
    )

    for weighted_output in (means_w0, vars_w0, n_incr_w0):
        assert weighted_output.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=means0,
        last_var=vars0,
        last_n=n_incr0,
        weights=None,
    )
    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=means_w0,
        last_var=vars_w0,
        last_n=n_incr_w0,
        weights=weights,
    )

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    for weighted_output in (means_w1, vars_w1, n_incr_w1):
        assert weighted_output.dtype == dtype
|
||||
|
||||
|
||||
def test_incr_mean_variance_axis():
    """Check `incr_mean_variance_axis` against `mean_variance_axis`.

    For both axes: invalid inputs raise ``TypeError``, a single-chunk update
    from zeroed statistics matches the direct computation, and so does an
    update on the whole data for several input dtypes.
    """
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        # Chunk length and chunk count swap roles with the axis.
        if axis == 0:
            chunk_size, n_chunks = n_features, n_samples
        else:
            chunk_size, n_chunks = n_samples, n_features
        data_chunks = [rng.randint(0, 2, size=chunk_size) for _ in range(n_chunks)]

        # default params for incr_mean_variance
        last_mean = np.zeros(chunk_size)
        last_var = np.zeros_like(last_mean)
        last_n = np.zeros_like(last_mean, dtype=np.int64)

        # Test errors
        X = np.atleast_2d(np.array(data_chunks[0]))
        if axis == 1:
            X = X.T
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)

        # NOTE(review): this call passes no `last_n` and shifts every value
        # by one argument; confirm which TypeError it is meant to trigger.
        with pytest.raises(TypeError):
            incr_mean_variance_axis(
                X=axis, axis=last_mean, last_mean=last_var, last_var=last_n
            )
        with pytest.raises(TypeError):
            incr_mean_variance_axis(
                X_lil, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
            )

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
            X_csr, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
        )
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        # X.shape[axis] picks # samples
        assert_array_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_array_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        if axis == 1:
            X = X.T
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)

        # (input dtype, dtype expected for the returned statistics)
        dtype_pairs = [
            (np.float32, np.float32),
            (np.float64, np.float64),
            (np.int32, np.float64),
            (np.int64, np.float64),
        ]

        for input_dtype, output_dtype in dtype_pairs:
            for sparse in (X_csr, X_csc):
                typed_sparse = sparse.astype(input_dtype)
                last_mean = last_mean.astype(output_dtype)
                last_var = last_var.astype(output_dtype)
                X_means, X_vars = mean_variance_axis(typed_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
                    typed_sparse,
                    axis=axis,
                    last_mean=last_mean,
                    last_var=last_var,
                    last_n=last_n,
                )
                assert X_means_incr.dtype == output_dtype
                assert X_vars_incr.dtype == output_dtype
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_array_equal(X.shape[axis], n_incr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
    """Check that we raise proper error when axis=1 and the dimension mismatch.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/18655
    """
    n_samples, n_features = 60, 4
    rng = np.random.RandomState(42)
    X = sparse_constructor(rng.rand(n_samples, n_features))

    last_mean = np.zeros(n_features)
    last_var = np.zeros_like(last_mean)
    last_n = np.zeros(last_mean.shape, dtype=np.int64)

    # Statistics sized for the features work along axis 0 ...
    matching = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)
    mean0, var0, _ = incr_mean_variance_axis(X, axis=0, **matching)
    dense = X.toarray()
    assert_allclose(np.mean(dense, axis=0), mean0)
    assert_allclose(np.var(dense, axis=0), var0)

    # test ValueError if axis=1 and last_mean.size == n_features
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X, axis=1, **matching)

    # test inconsistent shapes of last_mean, last_var, last_n
    truncated = dict(last_mean=last_mean[:-1], last_var=last_var, last_n=last_n)
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X, axis=0, **truncated)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X1, X2",
    [
        (
            sp.random(5, 2, density=0.8, format="csr", random_state=0),
            sp.random(13, 2, density=0.8, format="csr", random_state=0),
        ),
        (
            sp.random(5, 2, density=0.8, format="csr", random_state=0),
            sp.hstack(
                [
                    sp.csr_matrix(np.full((13, 1), fill_value=np.nan)),
                    sp.random(13, 1, density=0.8, random_state=42),
                ],
                format="csr",
            ),
        ),
    ],
)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2):
    """Check two incremental updates match statistics on the stacked data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/16448
    """
    axis = 0
    n_features = X1.shape[1]
    running_mean, running_var = np.zeros(n_features), np.zeros(n_features)
    running_n = np.zeros(n_features, dtype=np.int64)

    # Feed the two batches one after the other, carrying the statistics over.
    for batch in (X1, X2):
        running_mean, running_var, running_n = incr_mean_variance_axis(
            batch,
            axis=axis,
            last_mean=running_mean,
            last_var=running_var,
            last_n=running_n,
        )

    stacked = sp.vstack([X1, X2]).toarray()
    assert_allclose(running_mean, np.nanmean(stacked, axis=axis))
    assert_allclose(running_var, np.nanvar(stacked, axis=axis))
    assert_allclose(running_n, np.count_nonzero(~np.isnan(stacked), axis=0))
|
||||
|
||||
|
||||
def test_incr_mean_variance_no_new_n():
    """Check that updating with a zero-row matrix leaves the statistics unchanged."""
    axis = 0
    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
    n_features = X1.shape[1]

    last_mean, last_var = np.zeros(n_features), np.zeros(n_features)
    last_n = np.zeros(n_features, dtype=np.int64)
    last_mean, last_var, last_n = incr_mean_variance_axis(
        X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )

    # update the statistics with the empty matrix, which should be ignored
    updated = incr_mean_variance_axis(
        X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    updated_mean, updated_var, updated_n = updated

    assert_allclose(updated_mean, last_mean)
    assert_allclose(updated_var, last_var)
    assert_allclose(updated_n, last_n)
|
||||
|
||||
|
||||
def test_incr_mean_variance_n_float():
    """Check the behaviour when ``last_n`` is just a number.

    Starting from ``last_n=0``, the returned count must equal the number of
    rows for every column.
    """
    axis = 0
    X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
    n_rows, n_cols = X.shape

    _, _, new_n = incr_mean_variance_axis(
        X,
        axis=axis,
        last_mean=np.zeros(n_cols),
        last_var=np.zeros(n_cols),
        last_n=0,
    )

    assert_allclose(new_n, np.full(n_cols, n_rows))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
    """Check NaN entries do not change the incremental statistics.

    ``X_nan`` holds, per column, the same three values as ``X`` plus one NaN;
    both updates start from the same statistics and must give the same result.
    """
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)

    X = sparse_constructor(
        np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])
    )

    X_nan = sparse_constructor(
        np.array(
            [
                [170, np.nan, 170, 170],
                [np.nan, 170, 430, 430],
                [430, 430, np.nan, 300],
                [300, 300, 300, np.nan],
            ]
        )
    )

    # we avoid creating specific data for axis 0 and 1: transposing the data
    # is enough.
    if axis:
        X = X.T
        X_nan = X_nan.T

    def update_from_old(data):
        # take a copy of the old statistics since they are modified in place.
        return incr_mean_variance_axis(
            data,
            axis=axis,
            last_mean=old_means.copy(),
            last_var=old_variances.copy(),
            last_n=old_sample_count.copy(),
        )

    X_means, X_vars, X_sample_count = update_from_old(X)
    X_nan_means, X_nan_vars, X_nan_sample_count = update_from_old(X_nan)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_vars, X_vars)
    assert_allclose(X_nan_sample_count, X_sample_count)
|
||||
|
||||
|
||||
def test_mean_variance_illegal_axis():
    """Check that axes -3, 2 and -1 raise ``ValueError`` in both functions."""
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    for row, col in ((0, 0), (2, 1), (4, 3)):
        X[row, col] = 0
    X_csr = sp.csr_matrix(X)

    illegal_axes = (-3, 2, -1)

    for bad_axis in illegal_axes:
        with pytest.raises(ValueError):
            mean_variance_axis(X_csr, axis=bad_axis)

    for bad_axis in illegal_axes:
        with pytest.raises(ValueError):
            incr_mean_variance_axis(
                X_csr, axis=bad_axis, last_mean=None, last_var=None, last_n=None
            )
|
||||
|
||||
|
||||
def test_densify_rows():
    """Check `assign_rows_csr` writes the selected CSR rows into a dense array.

    Rows 0, 2 and 3 of ``X`` must land in rows 1, 3 and 4 of ``out``; all other
    rows of ``out`` must keep their initial value of one.
    """
    source_rows = np.array([0, 2, 3], dtype=np.intp)
    target_rows = np.array([1, 3, 4], dtype=np.intp)

    for dtype in (np.float32, np.float64):
        X = sp.csr_matrix(
            [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
        )
        out = np.ones((6, X.shape[1]), dtype=dtype)

        expected = np.ones_like(out)
        expected[target_rows] = X[source_rows, :].toarray()

        assign_rows_csr(X, source_rows, target_rows, out)
        assert_array_equal(out, expected)
|
||||
|
||||
|
||||
def test_inplace_column_scale():
    """Check `inplace_column_scale` on CSR and CSC input against dense scaling.

    The check runs for float64 and float32 data; LIL input must raise
    ``TypeError``.
    """
    rng = np.random.RandomState(0)
    # Draw the matrix from the seeded generator as well, so the test data is
    # reproducible instead of depending on the global random state.
    X = sp.rand(100, 200, 0.05, random_state=rng)
    scale = rng.rand(200)

    for dtype in (np.float64, np.float32):
        X_typed = X.astype(dtype)
        scale_typed = scale.astype(dtype)
        Xr = X_typed.tocsr()
        Xc = X_typed.tocsc()
        XA = X_typed.toarray()
        XA *= scale_typed

        inplace_column_scale(Xc, scale_typed)
        inplace_column_scale(Xr, scale_typed)
        assert_array_almost_equal(Xr.toarray(), Xc.toarray())
        assert_array_almost_equal(XA, Xc.toarray())
        assert_array_almost_equal(XA, Xr.toarray())
        with pytest.raises(TypeError):
            inplace_column_scale(X_typed.tolil(), scale_typed)
|
||||
|
||||
|
||||
def test_inplace_row_scale():
    """Check `inplace_row_scale` on CSR and CSC input against dense scaling.

    The check runs for float64 and float32 data; LIL input must raise
    ``TypeError``.
    """
    rng = np.random.RandomState(0)
    # Draw the matrix from the seeded generator as well, so the test data is
    # reproducible instead of depending on the global random state.
    X = sp.rand(100, 200, 0.05, random_state=rng)
    scale = rng.rand(100)

    for dtype in (np.float64, np.float32):
        X_typed = X.astype(dtype)
        scale_typed = scale.astype(dtype)
        Xr = X_typed.tocsr()
        Xc = X_typed.tocsc()
        XA = X_typed.toarray()
        XA *= scale_typed.reshape(-1, 1)

        inplace_row_scale(Xc, scale_typed)
        inplace_row_scale(Xr, scale_typed)
        assert_array_almost_equal(Xr.toarray(), Xc.toarray())
        assert_array_almost_equal(XA, Xc.toarray())
        assert_array_almost_equal(XA, Xr.toarray())
        # Exercise the function under test here, not inplace_column_scale.
        with pytest.raises(TypeError):
            inplace_row_scale(X_typed.tolil(), scale_typed)
|
||||
|
||||
|
||||
def test_inplace_swap_row():
    """Check `inplace_swap_row` on CSR and CSC input against a dense BLAS swap.

    The check runs for float64 and float32 data; LIL input must raise
    ``TypeError``.
    """
    for dtype in (np.float64, np.float32):
        X = np.array(
            [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
        )
        X_csr = sp.csr_matrix(X)
        X_csc = sp.csc_matrix(X)

        (swap,) = linalg.get_blas_funcs(("swap",), (X,))

        for first, second in ((0, -1), (2, 3)):
            X[first], X[second] = swap(X[first], X[second])
            inplace_swap_row(X_csr, first, second)
            inplace_swap_row(X_csc, first, second)
            assert_array_equal(X_csr.toarray(), X_csc.toarray())
            assert_array_equal(X, X_csc.toarray())
            assert_array_equal(X, X_csr.toarray())

        # Pass the row indices too: without them the call fails with a
        # missing-argument TypeError before the sparse format is ever looked at.
        with pytest.raises(TypeError):
            inplace_swap_row(X_csr.tolil(), 0, -1)
|
||||
|
||||
|
||||
def test_inplace_swap_column():
    """Check `inplace_swap_column` on CSR and CSC input against a dense BLAS swap.

    The check runs for float64 and float32 data; LIL input must raise
    ``TypeError``.
    """
    for dtype in (np.float64, np.float32):
        X = np.array(
            [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
        )
        X_csr = sp.csr_matrix(X)
        X_csc = sp.csc_matrix(X)

        (swap,) = linalg.get_blas_funcs(("swap",), (X,))

        for first, second in ((0, -1), (0, 1)):
            X[:, first], X[:, second] = swap(X[:, first], X[:, second])
            inplace_swap_column(X_csr, first, second)
            inplace_swap_column(X_csc, first, second)
            assert_array_equal(X_csr.toarray(), X_csc.toarray())
            assert_array_equal(X, X_csc.toarray())
            assert_array_equal(X, X_csr.toarray())

        # Pass the column indices too: without them the call fails with a
        # missing-argument TypeError before the sparse format is ever looked at.
        with pytest.raises(TypeError):
            inplace_swap_column(X_csr.tolil(), 0, -1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
@pytest.mark.parametrize(
    "missing_values, min_func, max_func, ignore_nan",
    [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)],
)
@pytest.mark.parametrize("large_indices", [True, False])
def test_min_max(
    dtype,
    axis,
    sparse_format,
    missing_values,
    min_func,
    max_func,
    ignore_nan,
    large_indices,
):
    """Compare min_max_axis on sparse input with the dense NumPy reductions.

    ``missing_values`` is placed in two cells of the matrix; ``min_func`` and
    ``max_func`` are the NumPy reductions expected to agree with
    ``min_max_axis`` for the given ``ignore_nan`` setting.
    """
    dense_rows = [
        [0, 3, 0],
        [2, -1, missing_values],
        [0, 0, 0],
        [9, missing_values, 7],
        [4, 0, 5],
    ]
    X = np.array(dense_rows, dtype=dtype)
    X_sparse = sparse_format(X)

    if large_indices:
        # Also cover matrices whose index arrays are 64-bit integers.
        X_sparse.indices = X_sparse.indices.astype("int64")
        X_sparse.indptr = X_sparse.indptr.astype("int64")

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)

    expected_min = min_func(X, axis=axis)
    assert_array_equal(mins_sparse, expected_min)
    expected_max = max_func(X, axis=axis)
    assert_array_equal(maxs_sparse, expected_max)
|
||||
|
||||
|
||||
def test_min_max_axis_errors():
    """min_max_axis must reject LIL input and out-of-range axis values."""
    dense = np.array(
        [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr, X_csc = sp.csr_matrix(dense), sp.csc_matrix(dense)

    # Unsupported sparse format.
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)

    # Axis values outside the accepted range, one per supported format.
    for matrix, bad_axis in ((X_csr, 2), (X_csc, -3)):
        with pytest.raises(ValueError):
            min_max_axis(matrix, axis=bad_axis)
|
||||
|
||||
|
||||
def test_count_nonzero():
    """Check count_nonzero values, error cases and result dtypes on CSR input."""
    X = np.array(
        [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr, X_csc = sp.csr_matrix(X), sp.csc_matrix(X)
    sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1]

    # Dense references: a boolean mask and the same mask scaled per row.
    nonzero_mask = X != 0
    weighted_mask = nonzero_mask * np.array(sample_weight)[:, None]

    for axis in (0, 1, -1, -2, None):
        assert_array_almost_equal(
            count_nonzero(X_csr, axis=axis), nonzero_mask.sum(axis=axis)
        )
        assert_array_almost_equal(
            count_nonzero(X_csr, axis=axis, sample_weight=sample_weight),
            weighted_mask.sum(axis=axis),
        )

    with pytest.raises(TypeError):
        count_nonzero(X_csc)
    with pytest.raises(ValueError):
        count_nonzero(X_csr, axis=2)

    def assert_same_dtype_on_both_axes():
        # The result dtype must not depend on the axis, weighted or not.
        plain = [count_nonzero(X_csr, axis=ax).dtype for ax in (0, 1)]
        assert plain[0] == plain[1]
        weighted = [
            count_nonzero(X_csr, axis=ax, sample_weight=sample_weight).dtype
            for ax in (0, 1)
        ]
        assert weighted[0] == weighted[1]

    assert_same_dtype_on_both_axes()

    # Check dtypes with large sparse matrices too
    # XXX: test fails on 32bit (Windows/Linux)
    try:
        X_csr.indices = X_csr.indices.astype(np.int64)
        X_csr.indptr = X_csr.indptr.astype(np.int64)
        assert_same_dtype_on_both_axes()
    except TypeError as e:
        # Only tolerated when it is the "safe" casting error on a platform
        # whose np.intp is narrower than 8 bytes.
        assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e
|
||||
|
||||
|
||||
def test_csc_row_median():
    """Check csc_median_axis_0 against np.median and on small known inputs."""
    rng = np.random.RandomState(0)

    # Fully populated data stored in CSC format.
    X = rng.rand(100, 50)
    expected = np.median(X, axis=0)
    assert_array_equal(csc_median_axis_0(sp.csc_matrix(X)), expected)

    # Data with many explicit zeros and some rows flipped to negative values.
    X = rng.rand(51, 100)
    X[X < 0.7] = 0.0
    flipped_rows = rng.randint(0, 50, 10)
    X[flipped_rows] = -X[flipped_rows]
    observed = csc_median_axis_0(sp.csc_matrix(X))
    expected = np.median(X, axis=0)
    assert_array_equal(observed, expected)

    # Toy data with hand-computed medians.
    toy_cases = [
        ([[0, -2], [-1, -1], [1, 0], [2, 1]], np.array([0.5, -0.5])),
        ([[0, -2], [-1, -5], [1, -3]], np.array([0.0, -3])),
    ]
    for X, expected in toy_cases:
        assert_array_equal(csc_median_axis_0(sp.csc_matrix(X)), expected)

    # Anything that is not CSC must be rejected (X is the last toy matrix).
    with pytest.raises(TypeError):
        csc_median_axis_0(sp.csr_matrix(X))
|
||||
|
||||
|
||||
def test_inplace_normalize():
    """Check the in-place L1 and L2 CSR row normalizers.

    Each normalizer is run for float64 and float32 data and for int32 and
    int64 index arrays; afterwards every row must have unit L1 norm (or unit
    sum of squares for the L2 variant) and the data dtype must be unchanged.
    """
    ones = np.ones((10, 1))
    rs = RandomState(10)

    for normalize in (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2):
        squares_expected = normalize is inplace_csr_row_normalize_l2
        for dtype in (np.float64, np.float32):
            X_csr = sp.csr_matrix(rs.randn(10, 5).astype(dtype))
            # The same matrix is reused for both index dtypes.
            for index_dtype in [np.int32, np.int64]:
                # csr_matrix will use int32 indices by default,
                # up-casting those to int64 when necessary
                if index_dtype is np.int64:
                    X_csr.indptr = X_csr.indptr.astype(index_dtype)
                    X_csr.indices = X_csr.indices.astype(index_dtype)
                assert X_csr.indices.dtype == index_dtype
                assert X_csr.indptr.dtype == index_dtype

                normalize(X_csr)
                assert X_csr.dtype == dtype

                if squares_expected:
                    # For L2 the squared entries of each row sum to one.
                    X_csr.data **= 2
                assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_csr_row_norms(dtype):
    """csr_row_norms must equal the squared scipy row norms and keep X.dtype."""
    X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42)

    expected = sp.linalg.norm(X, axis=1) ** 2
    norms = csr_row_norms(X)

    assert norms.dtype == dtype
    # Looser tolerance for single precision.
    if dtype == np.float32:
        rtol = 1e-6
    else:
        rtol = 1e-7
    assert_allclose(norms, expected, rtol=rtol)
|
||||
@@ -0,0 +1,98 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
from pytest import approx
|
||||
|
||||
from sklearn.utils.stats import _weighted_percentile
|
||||
|
||||
|
||||
def test_weighted_percentile():
    """The 50th weighted percentile is unaffected by a zero-weight outlier."""
    y = np.empty(102, dtype=np.float64)
    y[:50] = 0
    y[50] = 1
    y[-51:] = 2
    y[-1] = 100000  # extreme value; its weight is set to zero below

    sw = np.ones(102, dtype=np.float64)
    sw[-1] = 0.0

    assert approx(_weighted_percentile(y, sw, 50)) == 1
|
||||
|
||||
|
||||
def test_weighted_percentile_equal():
    """With all values equal to zero the weighted percentile is exactly zero."""
    y = np.zeros(102, dtype=np.float64)
    sw = np.ones(102, dtype=np.float64)
    sw[-1] = 0.0

    assert _weighted_percentile(y, sw, 50) == 0
|
||||
|
||||
|
||||
def test_weighted_percentile_zero_weight():
    """All-zero weights on constant data still return that constant."""
    y = np.ones(102, dtype=np.float64)
    sw = np.zeros(102, dtype=np.float64)

    assert approx(_weighted_percentile(y, sw, 50)) == 1.0
|
||||
|
||||
|
||||
def test_weighted_percentile_zero_weight_zero_percentile():
    """Zero-weight samples at both ends are skipped for percentiles 0, 50, 100."""
    y = np.array([0, 1, 2, 3, 4, 5])
    sw = np.array([0, 0, 1, 1, 1, 0])

    # Only the values 2, 3 and 4 carry a non-zero weight.
    for percentile, expected in ((0, 2), (50, 3), (100, 4)):
        assert approx(_weighted_percentile(y, sw, percentile)) == expected
|
||||
|
||||
|
||||
def test_weighted_median_equal_weights():
    """With equal weights the weighted percentile matches np.median.

    ``_weighted_percentile`` is called with its default percentile.
    """
    rng = np.random.RandomState(0)
    # Odd size as _weighted_percentile takes lower weighted percentile
    x = rng.randint(10, size=11)
    weights = np.ones(x.shape)

    assert np.median(x) == approx(_weighted_percentile(x, weights))
|
||||
|
||||
|
||||
def test_weighted_median_integer_weights():
    """Integer weights behave like repeating each sample ``weight`` times.

    ``_weighted_percentile`` is called with its default percentile and is
    compared with np.median of the manually repeated data.
    """
    rng = np.random.RandomState(0)
    x = rng.randint(20, size=10)
    weights = rng.choice(5, size=10)

    expected = np.median(np.repeat(x, weights))

    assert expected == approx(_weighted_percentile(x, weights))
|
||||
|
||||
|
||||
def test_weighted_percentile_2d():
    """2D input must give the same result as processing each column alone."""
    rng = np.random.RandomState(0)
    x1 = rng.randint(10, size=10)
    w1 = rng.choice(5, size=10)
    x2 = rng.randint(20, size=10)
    x_2d = np.vstack((x1, x2)).T
    n_columns = x_2d.shape[1]

    # 2D array with a 1D sample_weight shared by every column.
    per_column = [_weighted_percentile(x_2d[:, j], w1) for j in range(n_columns)]
    assert_allclose(_weighted_percentile(x_2d, w1), per_column)

    # 2D array with a 2D sample_weight (one weight column per data column).
    w2 = rng.choice(5, size=10)
    w_2d = np.vstack((w1, w2)).T

    w_median = _weighted_percentile(x_2d, w_2d)
    per_column = [
        _weighted_percentile(x_2d[:, j], w_2d[:, j]) for j in range(n_columns)
    ]
    assert_allclose(w_median, per_column)
|
||||
@@ -0,0 +1,47 @@
|
||||
import pytest
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.utils._tags import (
|
||||
_DEFAULT_TAGS,
|
||||
_safe_tags,
|
||||
)
|
||||
|
||||
|
||||
class NoTagsEstimator:
    """Bare object defining neither ``_get_tags`` nor ``_more_tags``."""
|
||||
|
||||
|
||||
class MoreTagsEstimator:
    """Object that only defines ``_more_tags``, overriding ``allow_nan``."""

    def _more_tags(self):
        return dict(allow_nan=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator, err_msg",
    [
        (BaseEstimator(), "The key xxx is not defined in _get_tags"),
        (NoTagsEstimator(), "The key xxx is not defined in _DEFAULT_TAGS"),
    ],
)
def test_safe_tags_error(estimator, err_msg):
    """_safe_tags must raise ValueError with ``err_msg`` for an unknown key."""
    unknown_key = "xxx"
    with pytest.raises(ValueError, match=err_msg):
        _safe_tags(estimator, key=unknown_key)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator, key, expected_results",
    [
        (NoTagsEstimator(), None, _DEFAULT_TAGS),
        (NoTagsEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
        (MoreTagsEstimator(), None, {**_DEFAULT_TAGS, "allow_nan": True}),
        (MoreTagsEstimator(), "allow_nan", True),
        (BaseEstimator(), None, _DEFAULT_TAGS),
        (BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
    ],
)
def test_safe_tags_no_get_tags(estimator, key, expected_results):
    """Check the value returned by _safe_tags for several kinds of estimator.

    Covered: an object without any tag method, an object defining only
    ``_more_tags``, and a plain ``BaseEstimator``; each is queried for the
    full tag dictionary (``key=None``) and for the single ``allow_nan`` tag.
    """
    assert _safe_tags(estimator, key=key) == expected_results
|
||||
@@ -0,0 +1,876 @@
|
||||
import warnings
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import atexit
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils.deprecation import deprecated
|
||||
from sklearn.utils.metaestimators import available_if, if_delegate_has_method
|
||||
from sklearn.utils._readonly_array_wrapper import _test_sum
|
||||
from sklearn.utils._testing import (
|
||||
assert_raises,
|
||||
assert_warns,
|
||||
assert_no_warnings,
|
||||
set_random_state,
|
||||
assert_raise_message,
|
||||
ignore_warnings,
|
||||
check_docstring_parameters,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_raises_regex,
|
||||
TempMemmap,
|
||||
create_memmap_backed_data,
|
||||
_delete_folder,
|
||||
_convert_container,
|
||||
raises,
|
||||
assert_allclose,
|
||||
)
|
||||
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
|
||||
|
||||
def test_set_random_state():
    """set_random_state sets ``random_state`` when the estimator has one."""
    seed = 3

    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(LinearDiscriminantAnalysis(), seed)

    tree = DecisionTreeClassifier()
    set_random_state(tree, seed)
    assert tree.random_state == seed
|
||||
|
||||
|
||||
def test_assert_allclose_dense_sparse():
    """Check assert_allclose_dense_sparse on dense, sparse and mixed inputs."""
    dense = np.arange(9).reshape(3, 3)
    as_csc = sparse.csc_matrix(dense)
    mismatch_msg = "Not equal to tolerance "

    for X in (dense, as_csc):
        # Different values must fail, identical values must pass.
        with pytest.raises(AssertionError, match=mismatch_msg):
            assert_allclose_dense_sparse(X, X * 2)
        assert_allclose_dense_sparse(X, X)

    # Mixing a dense array with a sparse matrix is refused.
    with pytest.raises(ValueError, match="Can only compare two sparse"):
        assert_allclose_dense_sparse(dense, as_csc)

    # Two sparse matrices of different shapes.
    A = sparse.diags(np.ones(5), offsets=0).tocsr()
    B = sparse.csr_matrix(np.ones((1, 5)))
    with pytest.raises(AssertionError, match="Arrays are not equal"):
        assert_allclose_dense_sparse(B, A)
|
||||
|
||||
|
||||
def test_assert_raises_msg():
    """assert_raises must fail with the custom ``msg`` when nothing is raised."""
    with assert_raises_regex(AssertionError, "Hello world"), assert_raises(
        ValueError, msg="Hello world"
    ):
        pass
|
||||
|
||||
|
||||
def test_assert_raise_message():
    """Check assert_raise_message on matching, mismatching and silent callables."""

    def _raise_ValueError(message):
        raise ValueError(message)

    def _no_raise():
        pass

    # Expected exception type and message: passes.
    assert_raise_message(ValueError, "test", _raise_ValueError, "test")

    # Right exception type but the message does not match.
    with assert_raises(AssertionError):
        assert_raise_message(ValueError, "something else", _raise_ValueError, "test")

    # Another exception type is raised: the original ValueError escapes.
    with assert_raises(ValueError):
        assert_raise_message(TypeError, "something else", _raise_ValueError, "test")

    # Nothing is raised at all.
    with assert_raises(AssertionError):
        assert_raise_message(ValueError, "test", _no_raise)

    # multiple exceptions in a tuple
    with assert_raises(AssertionError):
        assert_raise_message((ValueError, AttributeError), "test", _no_raise)
|
||||
|
||||
|
||||
def test_ignore_warning():
    """Exercise ignore_warnings as a wrapper, a decorator and a context manager.

    For each usage, the warning categories named in ``category`` must be
    silenced while any other category still gets through.
    """

    def _emit_deprecation():
        warnings.warn("deprecation warning", DeprecationWarning)

    def _emit_deprecation_then_user():
        warnings.warn("deprecation warning", DeprecationWarning)
        warnings.warn("deprecation warning")

    both_categories = (DeprecationWarning, UserWarning)

    # --- ignore_warnings applied directly to a callable ---
    assert_no_warnings(ignore_warnings(_emit_deprecation))
    assert_no_warnings(ignore_warnings(_emit_deprecation, category=DeprecationWarning))
    with pytest.warns(DeprecationWarning):
        ignore_warnings(_emit_deprecation, category=UserWarning)()
    with pytest.warns(UserWarning):
        ignore_warnings(_emit_deprecation_then_user, category=FutureWarning)()
    with pytest.warns(DeprecationWarning):
        ignore_warnings(_emit_deprecation_then_user, category=UserWarning)()
    assert_no_warnings(ignore_warnings(_emit_deprecation, category=both_categories))

    # --- ignore_warnings used as a decorator ---
    @ignore_warnings
    def decorated_all():
        _emit_deprecation()
        _emit_deprecation_then_user()

    @ignore_warnings(category=(DeprecationWarning, UserWarning))
    def decorated_both_on_multiple():
        _emit_deprecation_then_user()

    @ignore_warnings(category=DeprecationWarning)
    def decorated_deprecation_on_single():
        _emit_deprecation()

    @ignore_warnings(category=UserWarning)
    def decorated_user_on_single():
        _emit_deprecation()

    @ignore_warnings(category=DeprecationWarning)
    def decorated_deprecation_on_multiple():
        _emit_deprecation_then_user()

    @ignore_warnings(category=UserWarning)
    def decorated_user_on_multiple():
        _emit_deprecation_then_user()

    assert_no_warnings(decorated_all)
    assert_no_warnings(decorated_both_on_multiple)
    assert_no_warnings(decorated_deprecation_on_single)
    with pytest.warns(DeprecationWarning):
        decorated_user_on_single()
    with pytest.warns(UserWarning):
        decorated_deprecation_on_multiple()
    with pytest.warns(DeprecationWarning):
        decorated_user_on_multiple()

    # --- ignore_warnings used as a context manager ---
    def managed_all():
        with ignore_warnings():
            _emit_deprecation()

    def managed_both_on_multiple():
        with ignore_warnings(category=(DeprecationWarning, UserWarning)):
            _emit_deprecation_then_user()

    def managed_deprecation_on_single():
        with ignore_warnings(category=DeprecationWarning):
            _emit_deprecation()

    def managed_user_on_single():
        with ignore_warnings(category=UserWarning):
            _emit_deprecation()

    def managed_deprecation_on_multiple():
        with ignore_warnings(category=DeprecationWarning):
            _emit_deprecation_then_user()

    def managed_user_on_multiple():
        with ignore_warnings(category=UserWarning):
            _emit_deprecation_then_user()

    assert_no_warnings(managed_all)
    assert_no_warnings(managed_both_on_multiple)
    assert_no_warnings(managed_deprecation_on_single)
    with pytest.warns(DeprecationWarning):
        managed_user_on_single()
    with pytest.warns(UserWarning):
        managed_deprecation_on_multiple()
    with pytest.warns(DeprecationWarning):
        managed_user_on_multiple()

    # --- a warning class passed as the first positional argument ---
    # This is a misuse and must be reported with a helpful ValueError.
    warning_class = UserWarning
    match = "'obj' should be a callable.+you should use 'category=UserWarning'"

    with pytest.raises(ValueError, match=match):
        silenced = ignore_warnings(warning_class)(_emit_deprecation)
        silenced()

    with pytest.raises(ValueError, match=match):

        @ignore_warnings(warning_class)
        def test():
            pass
|
||||
|
||||
|
||||
class TestWarns(unittest.TestCase):
    """Checks for the assert_warns and assert_no_warnings helpers."""

    def test_warn(self):
        """assert_warns returns the callable's result and leaves filters intact."""

        def f():
            warnings.warn("yo")
            return 3

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            filters_before = warnings.filters[:]

            # TODO: remove in 1.2
            with pytest.warns(FutureWarning):
                assert assert_warns(UserWarning, f) == 3

            # test that assert_warns doesn't have side effects on warnings
            # filters
            assert warnings.filters == filters_before

        with pytest.raises(AssertionError):
            assert_no_warnings(f)
        assert assert_no_warnings(lambda x: x, 1) == 1

    # TODO: remove in 1.2
    @ignore_warnings(category=FutureWarning)
    def test_warn_wrong_warning(self):
        """assert_warns must fail when only another warning category is emitted."""

        def f():
            warnings.warn("yo", FutureWarning)

        warnings_module = sys.modules["warnings"]
        saved_filters = warnings_module.filters[:]
        try:
            try:
                # Should raise an AssertionError

                # assert_warns has a special handling of "FutureWarning" that
                # pytest.warns does not have
                assert_warns(UserWarning, f)
            except AssertionError:
                wrongly_accepted = False
            else:
                wrongly_accepted = True
        finally:
            # Restore the filters whatever assert_warns did to them.
            sys.modules["warnings"].filters = saved_filters

        if wrongly_accepted:
            raise AssertionError("wrong warning caught by assert_warn")
|
||||
|
||||
|
||||
# Tests for docstrings:
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: the documented parameters match
# the signature, so no mismatch is expected. The docstring itself is test
# data and must be kept as is.
def f_ok(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: the section is titled
# "Results" instead of "Returns"; the test expects a RuntimeError mentioning
# "Unknown section Results". The docstring is test data: do not correct it.
def f_bad_sections(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Results
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: the signature is (b, a) while
# the docstring lists a then b; the test expects a name-mismatch report at
# index 0. The docstring is test data: do not reorder it.
def f_bad_order(b, a):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: the docstring documents a
# parameter "c" that is absent from the signature; the test expects it to be
# reported as the first extra item. The docstring is test data.
def f_too_many_param_docstring(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : int
        Parameter b
    c : int
        Parameter c

    Returns
    -------
    d : list
        Parameter c
    """
    d = a + b
    return d
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: parameter "b" is not
# documented; the test expects it to be reported as the first missing item
# unless "b" is passed in ``ignore``. The docstring is test data.
def f_missing(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c
|
||||
|
||||
|
||||
# Fixture for test_check_docstring_parameters: several parameter entries are
# written without a space before the colon ('a: int', 'b:', 'd:int'); the test
# expects exactly those three entries to be reported. The docstring is test
# data and must not be reformatted.
def f_check_param_definition(a, b, c, d, e):
    """Function f

    Parameters
    ----------
    a: int
        Parameter a
    b:
        Parameter b
    c :
        This is parsed correctly in numpydoc 1.2
    d:int
        Parameter d
    e
        No typespec is allowed without colon
    """
    # ``e`` is accepted but not used in the result.
    return a + b + c + d
|
||||
|
||||
|
||||
# Class holding method fixtures for test_check_docstring_parameters. The
# docstrings (or their absence) are test data and must be kept as is.
class Klass:
    # No docstring: the test expects "X" to be reported as the first missing
    # item for this method.
    def f_missing(self, X, y):
        pass

    # Sections titled "Parameter" and "Results": the test expects a
    # RuntimeError mentioning "Unknown section Parameter".
    def f_bad_sections(self, X, y):
        """Function f

        Parameter
        ---------
        a : int
            Parameter a
        b : float
            Parameter b

        Results
        -------
        c : list
            Parameter c
        """
        pass
|
||||
|
||||
|
||||
# Minimal delegate handed to the mock meta-estimators in the parametrization
# of test_check_docstring_parameters. It exposes fit, predict, predict_proba
# and score.
class MockEst:
    def __init__(self):
        """MockEstimator"""

    # Returns its input unchanged.
    def fit(self, X, y):
        return X

    # Returns its input unchanged.
    def predict(self, X):
        return X

    # Returns its input unchanged.
    def predict_proba(self, X):
        return X

    # Always returns the constant 1.0.
    def score(self, X):
        return 1.0
|
||||
|
||||
|
||||
# Meta-estimator fixture whose methods are exposed through ``available_if``.
# The method docstrings are inputs of test_check_docstring_parameters, which
# expects the mismatches they produce (documented name differing from the
# signature, 9-character underlines): they must not be "fixed".
class MockMetaEstimator:
    def __init__(self, delegate):
        """MetaEstimator to check if doctest on delegated methods work.

        Parameters
        ---------
        delegate : estimator
            Delegated estimator.
        """
        self.delegate = delegate

    # Documents "y" while the signature takes "X"; the test expects this
    # name mismatch to be reported.
    @available_if(lambda self: hasattr(self.delegate, "predict"))
    def predict(self, X):
        """This is available only if delegate has predict.

        Parameters
        ----------
        y : ndarray
            Parameter y
        """
        return self.delegate.predict(X)

    # Docstring only, so the method implicitly returns None; the test expects
    # a "potentially wrong underline length" report for it.
    @available_if(lambda self: hasattr(self.delegate, "score"))
    @deprecated("Testing a deprecated delegated method")
    def score(self, X):
        """This is available only if delegate has score.

        Parameters
        ---------
        y : ndarray
            Parameter y
        """

    # The test expects a "potentially wrong underline length" report here.
    @available_if(lambda self: hasattr(self.delegate, "predict_proba"))
    def predict_proba(self, X):
        """This is available only if delegate has predict_proba.

        Parameters
        ---------
        X : ndarray
            Parameter X
        """
        return X

    # Docstring only, so the method implicitly returns None.
    @deprecated("Testing deprecated function with wrong params")
    def fit(self, X, y):
        """Incorrect docstring but should not be tested"""
|
||||
|
||||
|
||||
# Same fixture as MockMetaEstimator but with methods exposed through
# ``if_delegate_has_method(delegate="delegate")``. The method docstrings are
# inputs of test_check_docstring_parameters, which expects the mismatches
# they produce: they must not be "fixed".
class MockMetaEstimatorDeprecatedDelegation:
    def __init__(self, delegate):
        """MetaEstimator to check if doctest on delegated methods work.

        Parameters
        ---------
        delegate : estimator
            Delegated estimator.
        """
        self.delegate = delegate

    # Documents "y" while the signature takes "X"; the test expects this
    # name mismatch to be reported.
    @if_delegate_has_method(delegate="delegate")
    def predict(self, X):
        """This is available only if delegate has predict.

        Parameters
        ----------
        y : ndarray
            Parameter y
        """
        return self.delegate.predict(X)

    # Docstring only, so the method implicitly returns None; the test expects
    # a "potentially wrong underline length" report for it.
    @if_delegate_has_method(delegate="delegate")
    @deprecated("Testing a deprecated delegated method")
    def score(self, X):
        """This is available only if delegate has score.

        Parameters
        ---------
        y : ndarray
            Parameter y
        """

    # The test expects a "potentially wrong underline length" report here.
    @if_delegate_has_method(delegate="delegate")
    def predict_proba(self, X):
        """This is available only if delegate has predict_proba.

        Parameters
        ---------
        X : ndarray
            Parameter X
        """
        return X

    # Docstring only, so the method implicitly returns None.
    @deprecated("Testing deprecated function with wrong params")
    def fit(self, X, y):
        """Incorrect docstring but should not be tested"""
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:if_delegate_has_method was deprecated")
@pytest.mark.parametrize(
    "mock_meta",
    [
        MockMetaEstimator(delegate=MockEst()),
        MockMetaEstimatorDeprecatedDelegation(delegate=MockEst()),
    ],
)
def test_check_docstring_parameters(mock_meta):
    """Check the reports of check_docstring_parameters on the fixtures above.

    Covers docstrings that are consistent, docstrings with unknown sections
    (RuntimeError), badly formatted parameter entries, and a list of
    mismatching functions/methods each paired with its exact expected report.
    """
    pytest.importorskip(
        "numpydoc",
        reason="numpydoc is required to test the docstrings",
        minversion="1.2.0",
    )

    # Consistent docstring, or the only inconsistent parameter is ignored.
    assert check_docstring_parameters(f_ok) == []
    assert check_docstring_parameters(f_ok, ignore=["b"]) == []
    assert check_docstring_parameters(f_missing, ignore=["b"]) == []

    # Unknown section titles abort the check.
    with pytest.raises(RuntimeError, match="Unknown section Results"):
        check_docstring_parameters(f_bad_sections)
    with pytest.raises(RuntimeError, match="Unknown section Parameter"):
        check_docstring_parameters(Klass.f_bad_sections)

    # Parameter entries lacking a space before the colon.
    no_space = (
        "sklearn.utils.tests.test_testing.f_check_param_definition There "
        "was no space between the param name and colon ('%s')"
    )
    incorrect = check_docstring_parameters(f_check_param_definition)
    assert incorrect == [no_space % spec for spec in ("a: int", "b:", "d:int")]

    mock_meta_name = type(mock_meta).__name__
    in_function = "In function: sklearn.utils.tests.test_testing."
    in_mock_method = in_function + mock_meta_name + "."

    # Each entry pairs a callable with the exact report expected for it.
    # NOTE(review): the "Full diff" lines must match the generated diff
    # character for character, including spacing — confirm against the
    # upstream file, whitespace may have been altered in this copy.
    cases = [
        (
            f_bad_order,
            [
                in_function + "f_bad_order",
                "There's a parameter name mismatch in function docstring w.r.t."
                " function signature, at index 0 diff: 'b' != 'a'",
                "Full diff:",
                "- ['b', 'a']",
                "+ ['a', 'b']",
            ],
        ),
        (
            f_too_many_param_docstring,
            [
                in_function + "f_too_many_param_docstring",
                "Parameters in function docstring have more items w.r.t. function"
                " signature, first extra item: c",
                "Full diff:",
                "- ['a', 'b']",
                "+ ['a', 'b', 'c']",
                "? +++++",
            ],
        ),
        (
            f_missing,
            [
                in_function + "f_missing",
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: b",
                "Full diff:",
                "- ['a', 'b']",
                "+ ['a']",
            ],
        ),
        (
            Klass.f_missing,
            [
                in_function + "Klass.f_missing",
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: X",
                "Full diff:",
                "- ['X', 'y']",
                "+ []",
            ],
        ),
        (
            mock_meta.predict,
            [
                in_mock_method + "predict",
                "There's a parameter name mismatch in function docstring w.r.t."
                " function signature, at index 0 diff: 'X' != 'y'",
                "Full diff:",
                "- ['X']",
                "? ^",
                "+ ['y']",
                "? ^",
            ],
        ),
        (
            mock_meta.predict_proba,
            [
                in_mock_method + "predict_proba",
                "potentially wrong underline length... ",
                "Parameters ",
                "--------- in ",
            ],
        ),
        (
            mock_meta.score,
            [
                in_mock_method + "score",
                "potentially wrong underline length... ",
                "Parameters ",
                "--------- in ",
            ],
        ),
        (
            mock_meta.fit,
            [
                in_mock_method + "fit",
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: X",
                "Full diff:",
                "- ['X', 'y']",
                "+ []",
            ],
        ),
    ]

    for f, msg in cases:
        incorrect = check_docstring_parameters(f)
        assert msg == incorrect, '\n"%s"\n not in \n"%s"' % (msg, incorrect)
|
||||
|
||||
|
||||
class RegistrationCounter:
    """Callable used in place of ``atexit.register`` to count registrations."""

    def __init__(self):
        self.nb_calls = 0

    def __call__(self, to_register_func):
        # Count the call first, then check that what gets registered wraps
        # _delete_folder (exposed through a ``func`` attribute).
        self.nb_calls += 1
        registered = to_register_func.func
        assert registered is _delete_folder
|
||||
|
||||
|
||||
def check_memmap(input_array, mmap_data, mmap_mode="r"):
    """Assert that ``mmap_data`` is a memmap holding ``input_array``'s values.

    The memmap must be writeable for every ``mmap_mode`` other than "r" and
    read-only for "r".
    """
    assert isinstance(mmap_data, np.memmap)
    expect_writeable = not (mmap_mode == "r")
    assert mmap_data.flags.writeable is expect_writeable
    np.testing.assert_array_equal(input_array, mmap_data)
|
||||
|
||||
|
||||
def test_tempmemmap(monkeypatch):
    """TempMemmap yields a memmap, cleans its folder up and registers at exit.

    ``atexit.register`` is replaced by a counter so that registrations can be
    counted instead of actually being scheduled.
    """
    counter = RegistrationCounter()
    monkeypatch.setattr(atexit, "register", counter)

    source = np.ones(3)

    # Without an explicit mmap_mode.
    with TempMemmap(source) as mapped:
        check_memmap(source, mapped)
        folder = os.path.dirname(mapped.filename)
    if os.name != "nt":
        # The folder-removal check is skipped on Windows.
        assert not os.path.exists(folder)
    assert counter.nb_calls == 1

    # With a writeable mode.
    mmap_mode = "r+"
    with TempMemmap(source, mmap_mode=mmap_mode) as mapped:
        check_memmap(source, mapped, mmap_mode=mmap_mode)
        folder = os.path.dirname(mapped.filename)
    if os.name != "nt":
        assert not os.path.exists(folder)
    assert counter.nb_calls == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("aligned", [False, True])
def test_create_memmap_backed_data(monkeypatch, aligned):
    """Check create_memmap_backed_data for arrays, folders, modes and lists.

    ``atexit.register`` is replaced by a counter; every call to
    ``create_memmap_backed_data`` is expected to register exactly once.
    """
    counter = RegistrationCounter()
    monkeypatch.setattr(atexit, "register", counter)

    source = np.ones(3)

    # Plain call.
    mapped = create_memmap_backed_data(source, aligned=aligned)
    check_memmap(source, mapped)
    assert counter.nb_calls == 1

    # Asking for the backing folder as well.
    mapped, folder = create_memmap_backed_data(
        source, return_folder=True, aligned=aligned
    )
    check_memmap(source, mapped)
    assert folder == os.path.dirname(mapped.filename)
    assert counter.nb_calls == 2

    # Writeable mode.
    mmap_mode = "r+"
    mapped = create_memmap_backed_data(source, mmap_mode=mmap_mode, aligned=aligned)
    check_memmap(source, mapped, mmap_mode)
    assert counter.nb_calls == 3

    # A list of arrays is only accepted when aligned=False.
    input_list = [source, source + 1, source + 2]
    if not aligned:
        mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
        for original, mapped in zip(input_list, mmap_data_list):
            check_memmap(original, mapped)
        assert counter.nb_calls == 4
    else:
        with pytest.raises(
            ValueError, match="If aligned=True, input must be a single numpy array."
        ):
            create_memmap_backed_data(input_list, aligned=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
    """_test_sum gives the same result on an array and on its memmap copy."""
    x = np.arange(10).astype(dtype)
    assert x.flags.c_contiguous
    assert x.flags.aligned

    # _test_sum consumes contiguous arrays
    # def _test_sum(NUM_TYPES[::1] x):
    expected = _test_sum(x)

    # now on memory mapped data
    # aligned=True so avoid https://github.com/joblib/joblib/issues/563
    # without alignment, this can produce segmentation faults, see
    # https://github.com/scikit-learn/scikit-learn/pull/21654
    x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)

    assert _test_sum(x_mmap) == pytest.approx(expected, rel=1e-11)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "constructor_name, container_type",
    [
        ("list", list),
        ("tuple", tuple),
        ("array", np.ndarray),
        ("sparse", sparse.csr_matrix),
        ("sparse_csr", sparse.csr_matrix),
        ("sparse_csc", sparse.csc_matrix),
        ("dataframe", lambda: pytest.importorskip("pandas").DataFrame),
        ("series", lambda: pytest.importorskip("pandas").Series),
        ("index", lambda: pytest.importorskip("pandas").Index),
        ("slice", slice),
    ],
)
@pytest.mark.parametrize(
    "dtype, superdtype",
    [
        (np.int32, np.integer),
        (np.int64, np.integer),
        (np.float32, np.floating),
        (np.float64, np.floating),
    ],
)
def test_convert_container(constructor_name, container_type, dtype, superdtype):
    """Check that _convert_container returns the expected container type
    holding data of the expected dtype."""
    pandas_constructors = ("dataframe", "series", "index")
    if constructor_name in pandas_constructors:
        # The pandas classes are wrapped in lambdas so that a missing pandas
        # only skips this test instead of the whole file.
        container_type = container_type()

    converted = _convert_container([0, 1], constructor_name, dtype=dtype)
    assert isinstance(converted, container_type)

    if constructor_name in ("list", "tuple", "index"):
        # list and tuple will use Python class dtype: int, float
        # pandas index will always use high precision: np.int64 and np.float64
        first_item_type = type(converted[0])
        assert np.issubdtype(first_item_type, superdtype)
    elif hasattr(converted, "dtype"):
        assert converted.dtype == dtype
    elif hasattr(converted, "dtypes"):
        assert converted.dtypes[0] == dtype
|
||||
|
||||
|
||||
def test_raises():
    """Exercise the ``raises`` context manager in its supported scenarios."""
    # Expected type, no pattern given.
    with raises(TypeError):
        raise TypeError()

    # Expected type, matching pattern.
    with raises(TypeError, match="how are you") as ctx:
        raise TypeError("hello how are you")
    assert ctx.raised_and_matched

    # Expected type, one of several patterns matches.
    with raises(TypeError, match=["not this one", "how are you"]) as ctx:
        raise TypeError("hello how are you")
    assert ctx.raised_and_matched

    # Unexpected type, no pattern: the original exception propagates.
    with pytest.raises(ValueError, match="this will be raised"), raises(
        TypeError
    ) as ctx:
        raise ValueError("this will be raised")
    assert not ctx.raised_and_matched

    # Unexpected type with an err_msg: an AssertionError carries err_msg.
    with pytest.raises(AssertionError, match="the failure message"), raises(
        TypeError, err_msg="the failure message"
    ) as ctx:
        raise ValueError()
    assert not ctx.raised_and_matched

    # Unexpected type with a pattern (the pattern is ignored anyway).
    with pytest.raises(ValueError, match="this will be raised"), raises(
        TypeError, match="this is ignored"
    ) as ctx:
        raise ValueError("this will be raised")
    assert not ctx.raised_and_matched

    # Expected type but the message does not match the pattern.
    with pytest.raises(
        AssertionError, match="should contain one of the following patterns"
    ), raises(TypeError, match="hello") as ctx:
        raise TypeError("Bad message")
    assert not ctx.raised_and_matched

    # Expected type, non-matching message, with err_msg.
    with pytest.raises(AssertionError, match="the failure message"), raises(
        TypeError, match="hello", err_msg="the failure message"
    ) as ctx:
        raise TypeError("Bad message")
    assert not ctx.raised_and_matched

    # Nothing raised with the default may_pass=False.
    with pytest.raises(AssertionError, match="Did not raise"), raises(
        TypeError
    ) as ctx:
        pass
    assert not ctx.raised_and_matched

    # Nothing raised with may_pass=True.
    with raises(TypeError, match="hello", may_pass=True) as ctx:
        pass  # still OK
    assert not ctx.raised_and_matched

    # A tuple of exception types is accepted.
    expected_types = (TypeError, ValueError)
    with raises(expected_types):
        raise TypeError()
    with raises(expected_types):
        raise ValueError()
    with pytest.raises(AssertionError), raises(expected_types):
        pass
|
||||
|
||||
|
||||
def test_float32_aware_assert_allclose():
    """Check the dtype-dependent default tolerances of assert_allclose."""
    # The relative tolerance for float32 inputs is 1e-4
    within_f32 = np.array([1.0 + 2e-5], dtype=np.float32)
    beyond_f32 = np.array([1.0 + 2e-4], dtype=np.float32)
    assert_allclose(within_f32, 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(beyond_f32, 1.0)

    # The relative tolerance for other inputs is left to 1e-7 as in
    # the original numpy version.
    within_f64 = np.array([1.0 + 2e-8], dtype=np.float64)
    beyond_f64 = np.array([1.0 + 2e-7], dtype=np.float64)
    assert_allclose(within_f64, 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(beyond_f64, 1.0)

    # atol is left to 0.0 by default, even for float32
    near_zero = np.array([1e-5], dtype=np.float32)
    with pytest.raises(AssertionError):
        assert_allclose(near_zero, 0.0)
    assert_allclose(np.array([1e-5], dtype=np.float32), 0.0, atol=2e-5)
|
||||
@@ -0,0 +1,739 @@
|
||||
from copy import copy
|
||||
from itertools import chain
|
||||
import warnings
|
||||
import string
|
||||
import timeit
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils._testing import (
|
||||
assert_array_equal,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_no_warnings,
|
||||
_convert_container,
|
||||
)
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils import _determine_key_type
|
||||
from sklearn.utils import deprecated
|
||||
from sklearn.utils import gen_batches
|
||||
from sklearn.utils import _get_column_indices
|
||||
from sklearn.utils import resample
|
||||
from sklearn.utils import safe_mask
|
||||
from sklearn.utils import column_or_1d
|
||||
from sklearn.utils import _safe_indexing
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.utils import gen_even_slices
|
||||
from sklearn.utils import _message_with_time, _print_elapsed_time
|
||||
from sklearn.utils import get_chunk_n_rows
|
||||
from sklearn.utils import is_scalar_nan
|
||||
from sklearn.utils import _to_object_array
|
||||
from sklearn.utils import _approximate_mode
|
||||
from sklearn.utils._mocking import MockDataFrame
|
||||
from sklearn import config_context
|
||||
|
||||
# Small 3x3 integer array (values 0..8) shared by several tests below.
X_toy = np.arange(9).reshape(3, 3)
|
||||
|
||||
|
||||
def test_make_rng():
    """Check the behavior of the check_random_state utility function."""
    global_rng = np.random.mtrand._rand

    # None and the np.random module both map to numpy's global RandomState.
    assert check_random_state(None) is global_rng
    assert check_random_state(np.random) is global_rng

    # An integer seed gives the same first draw as RandomState(seed).
    reference = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == reference.randint(100)

    # A RandomState instance is returned as is.
    reference = np.random.RandomState(42)
    assert check_random_state(reference) is reference

    # A different seed gives a different first draw.
    reference = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != reference.randint(100)

    with pytest.raises(ValueError):
        check_random_state("some invalid seed")
|
||||
|
||||
|
||||
def test_gen_batches():
    """Check gen_batches output and its errors on invalid batch_size."""
    expected_slices = [slice(0, 2, None), slice(2, 4, None)]
    assert_array_equal(list(gen_batches(4, 2)), expected_slices)

    # The generator is lazy: next() is needed to trigger the validation.
    zero_batches = gen_batches(4, 0)
    with pytest.raises(
        ValueError, match="gen_batches got batch_size=0, must be positive"
    ):
        next(zero_batches)

    float_batches = gen_batches(4, 0.5)
    with pytest.raises(
        TypeError, match="gen_batches got batch_size=0.5, must be an integer"
    ):
        next(float_batches)
|
||||
|
||||
|
||||
def test_deprecated():
    """Check that the ``deprecated`` decorator issues a FutureWarning.

    Adapted from the example in https://docs.python.org/library/warnings.html
    """
    # Decorating a function.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")

        @deprecated()
        def ham():
            return "spam"

        result = ham()

        assert result == "spam"  # function must remain usable

        assert len(caught) == 1
        first = caught[0]
        assert issubclass(first.category, FutureWarning)
        assert "deprecated" in str(first.message).lower()

    # Decorating a class.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")

        @deprecated("don't use this")
        class Ham:
            SPAM = 1

        instance = Ham()

        assert hasattr(instance, "SPAM")

        assert len(caught) == 1
        first = caught[0]
        assert issubclass(first.category, FutureWarning)
        assert "deprecated" in str(first.message).lower()
|
||||
|
||||
|
||||
def test_resample():
    """Check border cases and argument validation of resample."""
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Invalid arguments must raise a ValueError.
    invalid_calls = [
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {"replace": False, "n_samples": 3}),
    ]
    for args, kwargs in invalid_calls:
        with pytest.raises(ValueError):
            resample(*args, **kwargs)

    # Issue:6581, n_samples can be more when replace is True (default).
    oversampled = resample([1, 2], n_samples=5)
    assert len(oversampled) == 5
|
||||
|
||||
|
||||
def test_resample_stratified():
    """Make sure resample can stratify."""
    rng = np.random.RandomState(0)
    n_samples, p = 100, 0.9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)
    common = dict(n_samples=10, random_state=0)

    # Without stratification, every drawn label is 1.
    _, y_plain = resample(X, y, stratify=None, **common)
    assert np.all(y_plain == 1)

    # With stratification, exactly one of the ten drawn labels is 0.
    _, y_strat = resample(X, y, stratify=y, **common)
    assert not np.all(y_strat == 1)
    assert np.sum(y_strat) == 9  # all 1s, one 0
|
||||
|
||||
|
||||
def test_resample_stratified_replace():
    """Make sure stratified resampling supports the replace parameter."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    # The same rng is consumed by the successive calls, so order matters.
    with_replacement, _ = resample(
        X, y, replace=True, n_samples=50, random_state=rng, stratify=y
    )
    without_replacement, _ = resample(
        X, y, replace=False, n_samples=50, random_state=rng, stratify=y
    )
    assert np.unique(with_replacement).shape[0] < 50
    assert np.unique(without_replacement).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    oversampled, _ = resample(
        X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
    )
    assert oversampled.shape[0] == 1000
    assert np.unique(oversampled).shape[0] == 100
|
||||
|
||||
|
||||
def test_resample_stratify_2dy():
    """Make sure y can be 2d when stratifying."""
    rng = np.random.RandomState(0)
    n_samples = 100
    features = rng.normal(size=(n_samples, 1))
    labels = rng.randint(0, 2, size=(n_samples, 2))
    _, labels_resampled = resample(
        features, labels, n_samples=50, random_state=rng, stratify=labels
    )
    assert labels_resampled.ndim == 2
|
||||
|
||||
|
||||
def test_resample_stratify_sparse_error():
    """Check that a sparse ``stratify`` argument raises a TypeError."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    sparse_stratify = sp.csr_matrix(y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        resample(X, y, n_samples=50, random_state=rng, stratify=sparse_stratify)
|
||||
|
||||
|
||||
def test_safe_mask():
    """Check that safe_mask output can index dense and CSR data alike."""
    rng = check_random_state(0)
    dense = rng.rand(5, 4)
    csr = sp.csr_matrix(dense)
    raw_mask = [False, False, True, True, True]

    dense_mask = safe_mask(dense, raw_mask)
    assert dense[dense_mask].shape[0] == 3

    # The mask produced for the dense input is fed back in for the CSR one.
    csr_mask = safe_mask(csr, dense_mask)
    assert csr[csr_mask].shape[0] == 3
|
||||
|
||||
|
||||
def test_column_or_1d():
    """Check which target layouts column_or_1d ravels and which it rejects."""
    EXAMPLES = [
        ("binary", ["spam", "egg", "spam"]),
        ("binary", [0, 1, 0, 1]),
        ("continuous", np.arange(10) / 20.0),
        ("multiclass", [1, 2, 3]),
        ("multiclass", [0, 1, 2, 2, 0]),
        ("multiclass", [[1], [2], [3]]),
        ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]),
        ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("continuous-multioutput", np.arange(30).reshape((-1, 3))),
    ]
    # Labels of the examples that are expected to be flattened.
    flattenable = ["binary", "multiclass", "continuous"]

    for y_type, y in EXAMPLES:
        if y_type not in flattenable:
            with pytest.raises(ValueError):
                column_or_1d(y)
            continue
        assert_array_equal(column_or_1d(y), np.ravel(y))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, dtype",
    [
        (0, "int"),
        ("0", "str"),
        (True, "bool"),
        (np.bool_(True), "bool"),
        ([0, 1, 2], "int"),
        (["0", "1", "2"], "str"),
        ((0, 1, 2), "int"),
        (("0", "1", "2"), "str"),
        (slice(None, None), None),
        (slice(0, 2), "int"),
        (np.array([0, 1, 2], dtype=np.int32), "int"),
        (np.array([0, 1, 2], dtype=np.int64), "int"),
        (np.array([0, 1, 2], dtype=np.uint8), "int"),
        ([True, False], "bool"),
        ((True, False), "bool"),
        (np.array([True, False]), "bool"),
        ("col_0", "str"),
        (["col_0", "col_1", "col_2"], "str"),
        (("col_0", "col_1", "col_2"), "str"),
        (slice("begin", "end"), "str"),
        (np.array(["col_0", "col_1", "col_2"]), "str"),
        (np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
    ],
)
def test_determine_key_type(key, dtype):
    """Check the type label returned by _determine_key_type for each key."""
    inferred = _determine_key_type(key)
    assert inferred == dtype
|
||||
|
||||
|
||||
def test_determine_key_type_error():
    """Check that a float key is rejected with a ValueError."""
    expected_msg = "No valid specification of the"
    with pytest.raises(ValueError, match=expected_msg):
        _determine_key_type(1.0)
|
||||
|
||||
|
||||
def test_determine_key_type_slice_error():
    """Check that a slice key raises a TypeError when accept_slice=False."""
    expected_msg = "Only array-like or scalar are"
    slice_key = slice(0, 2, 1)
    with pytest.raises(TypeError, match=expected_msg):
        _determine_key_type(slice_key, accept_slice=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    """Check row selection (axis=0) on 2D containers with several index types."""
    raw_indices = [1, 2]
    if indices_type == "slice" and isinstance(raw_indices[1], int):
        # NOTE(review): presumably the pair becomes slice(start, stop) with an
        # exclusive stop, hence the increment - confirm in _convert_container.
        raw_indices[1] += 1

    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(rows, array_type)
    converted_indices = _convert_container(raw_indices, indices_type)

    selected = _safe_indexing(container, converted_indices, axis=0)
    expected = _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_1d_container(array_type, indices_type):
    """Check element selection on 1D containers with several index types."""
    raw_indices = [1, 2]
    if indices_type == "slice" and isinstance(raw_indices[1], int):
        # NOTE(review): presumably needed because a slice stop is exclusive -
        # confirm against _convert_container.
        raw_indices[1] += 1

    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    converted_indices = _convert_container(raw_indices, indices_type)

    selected = _safe_indexing(container, converted_indices, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    """Check column selection (axis=1) with integer and string indices."""
    # Work on a copy: ``indices`` is a mutable object shared between the
    # parametrized runs.
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    array = _convert_container(rows, array_type, columns_name)
    indices_converted = _convert_container(indices_converted, indices_type)

    string_on_non_dataframe = (
        isinstance(indices[0], str) and array_type != "dataframe"
    )
    if not string_on_non_dataframe:
        subset = _safe_indexing(array, indices_converted, axis=1)
        expected = _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        assert_allclose_dense_sparse(subset, expected)
        return

    err_msg = (
        "Specifying the columns using strings is only supported "
        "for pandas DataFrames"
    )
    with pytest.raises(ValueError, match=err_msg):
        _safe_indexing(array, indices_converted, axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
    "axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(
    array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
    """Check _safe_indexing when the data and/or the indices are read-only."""
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        data.setflags(write=False)
    container = _convert_container(data, array_type)

    index_values = np.array([1, 2])
    if indices_read_only:
        index_values.setflags(write=False)
    converted_indices = _convert_container(index_values, indices_type)

    selected = _safe_indexing(container, converted_indices, axis=axis)
    expected = _convert_container(expected_array, array_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    """Check boolean-mask selection on 1D containers."""
    # Only positions 1 and 2 are selected out of nine.
    mask = [False] + [True] * 2 + [False] * 6
    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    converted_mask = _convert_container(mask, indices_type)

    selected = _safe_indexing(container, converted_mask, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
    "axis, expected_subset",
    [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
    """Check boolean-mask selection on 2D containers along both axes."""
    columns_name = ["col_0", "col_1", "col_2"]
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(rows, array_type, columns_name)
    mask = _convert_container([False, True, True], indices_type)

    selected = _safe_indexing(container, mask, axis=axis)
    expected = _convert_container(expected_subset, array_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [
        ("list", "list"),
        ("array", "array"),
        ("sparse", "sparse"),
        ("dataframe", "series"),
    ],
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    """Check selecting a single row of a 2D container with a scalar index."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(rows, array_type)
    row_index = 2

    selected = _safe_indexing(container, row_index, axis=0)
    expected = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(selected, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "series"])
def test_safe_indexing_1d_scalar(array_type):
    """Check selecting a single element of a 1D container."""
    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    position = 2
    element = _safe_indexing(container, position, axis=0)
    assert element == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [("array", "array"), ("sparse", "sparse"), ("dataframe", "series")],
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
    """Check selecting a single column with an integer or a string key."""
    columns_name = ["col_0", "col_1", "col_2"]
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    array = _convert_container(rows, array_type, columns_name)

    string_on_non_dataframe = (
        isinstance(indices, str) and array_type != "dataframe"
    )
    if string_on_non_dataframe:
        err_msg = (
            "Specifying the columns using strings is only supported "
            "for pandas DataFrames"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
        return

    subset = _safe_indexing(array, indices, axis=1)
    if expected_output_type == "sparse":
        # sparse matrix are keeping the 2D shape
        expected_output = [[3], [6], [9]]
    else:
        expected_output = [3, 6, 9]
    expected_array = _convert_container(expected_output, expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
    """Check that indexing with None along axis 0 yields data equal to the input."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(rows, array_type)
    selected = _safe_indexing(container, None, axis=0)
    assert_allclose_dense_sparse(selected, container)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_matching_cols_error():
    """Check that a float column key on a DataFrame raises a ValueError."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(X_toy)
    expected_msg = "No valid specification of the columns."
    with pytest.raises(ValueError, match=expected_msg):
        _safe_indexing(frame, [1.0], axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
    """Check that an unsupported ``axis`` value raises a ValueError."""
    expected_msg = "'axis' should be either 0"
    with pytest.raises(ValueError, match=expected_msg):
        _safe_indexing(X_toy, [0, 1], axis=axis)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_constructor", ["array", "series"])
def test_safe_indexing_1d_array_error(X_constructor):
    """Check that indexing a 1D array-like on the 2nd dimension raises."""
    values = list(range(5))
    container = X_constructor
    if X_constructor == "array":
        container = np.asarray(values)
    elif X_constructor == "series":
        pd = pytest.importorskip("pandas")
        container = pd.Series(values)

    expected_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=expected_msg):
        _safe_indexing(container, [0, 1], axis=1)
|
||||
|
||||
|
||||
def test_safe_indexing_container_axis_0_unsupported_type():
    """Check that string indices along axis 0 raise a ValueError."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    string_indices = ["col_1", "col_2"]
    expected_msg = "String indexing is not supported with 'axis=0'"
    with pytest.raises(ValueError, match=expected_msg):
        _safe_indexing(rows, string_indices, axis=0)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_settingwithcopy_warning():
    """Check that modifying a ``_safe_indexing`` subset is warning-free.

    Using safe_indexing with an array-like indexer gives a copy of the
    DataFrame, so assigning into the subset must neither emit a
    SettingWithCopyWarning nor modify the original DataFrame.
    """
    pd = pytest.importorskip("pandas")

    # The warning class lives in ``pandas.errors`` in recent pandas and in
    # ``pandas.core.common`` in older releases; it may be missing entirely in
    # versions that dropped it, in which case there is nothing to check.
    warning_cls = getattr(pd.errors, "SettingWithCopyWarning", None)
    if warning_cls is None:
        warning_cls = getattr(pd.core.common, "SettingWithCopyWarning", None)
    if warning_cls is None:
        pytest.skip("SettingWithCopyWarning is not available in this pandas")

    X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    subset = _safe_indexing(X, [0, 1], axis=0)
    with warnings.catch_warnings():
        # Turn the warning into an error so that the test fails if it fires.
        warnings.simplefilter("error", warning_cls)
        subset.iloc[0, 0] = 10
    # The original dataframe is unaffected by the assignment on the subset:
    assert X.iloc[0, 0] == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, err_msg",
    [
        (10, r"all features must be in \[0, 2\]"),
        ("whatever", "A given column is not a column of the dataframe"),
    ],
)
def test_get_column_indices_error(key, err_msg):
    """Check the errors raised by _get_column_indices for invalid keys."""
    pd = pytest.importorskip("pandas")
    column_names = ["col_0", "col_1", "col_2"]
    frame = pd.DataFrame(X_toy, columns=column_names)

    with pytest.raises(ValueError, match=err_msg):
        _get_column_indices(frame, key)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
    """Check the error raised when selected column names are duplicated."""
    pd = pytest.importorskip("pandas")
    # "col1" and "col2" both appear twice in the column names.
    columns = ["col1", "col1", "col2", "col3", "col2"]
    frame = pd.DataFrame(np.zeros((1, 5), dtype=int), columns=columns)

    expected_msg = "Selected columns, {}, are not unique in dataframe".format(key)
    with pytest.raises(ValueError) as exc_info:
        _get_column_indices(frame, key)
    # Exact comparison rather than a regex match on the message.
    assert str(exc_info.value) == expected_msg
|
||||
|
||||
|
||||
def test_shuffle_on_ndim_equals_three():
    """Check that shuffle accepts a 3D array and only reorders its first axis.

    The result returned by ``shuffle`` is what gets checked: the previous
    version discarded it and only re-inspected the untouched input.
    """

    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
    S = set(to_tuple(A))
    A_shuffled = shuffle(A)  # shouldn't raise a ValueError for dim = 3

    # The shuffled result holds the same sub-arrays along the first axis ...
    assert A_shuffled.shape == A.shape
    assert set(to_tuple(A_shuffled)) == S
    # ... and the input still holds them as well.
    assert set(to_tuple(A)) == S
|
||||
|
||||
|
||||
def test_shuffle_dont_convert_to_array():
    """Check that shuffle preserves the container type of each input.

    shuffle must not try to convert its inputs to numpy arrays with float
    dtypes and must let any indexable data structure pass through.
    """
    a = ["a", "b", "c"]
    b = np.array(["a", "b", "c"], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    # Exact type checks use identity (``is``) rather than ``==``.
    assert a_s == ["c", "b", "a"]
    assert type(a_s) is list

    assert_array_equal(b_s, ["c", "b", "a"])
    assert b_s.dtype == object

    assert c_s == [3, 2, 1]
    assert type(c_s) is list

    assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
    assert type(d_s) is MockDataFrame

    assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
|
||||
|
||||
|
||||
def test_gen_even_slices():
    """Check gen_even_slices coverage and its error on negative n_packs."""
    # The generated slices, applied in order, must cover every sample.
    samples = range(10)
    covered = list(
        chain.from_iterable(samples[chunk] for chunk in gen_even_slices(10, 3))
    )
    assert_array_equal(samples, covered)

    # A negative number of packs only fails once the generator is advanced.
    negative_packs = gen_even_slices(10, -1)
    with pytest.raises(ValueError, match="gen_even_slices got n_packs=-1, must be >=1"):
        next(negative_packs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("row_bytes", "max_n_rows", "working_memory", "expected"),
    [
        (1024, None, 1, 1024),
        (1024, None, 0.99999999, 1023),
        (1023, None, 1, 1025),
        (1025, None, 1, 1023),
        (1024, None, 2, 2048),
        (1024, 7, 1, 7),
        (1024 * 1024, None, 1, 1),
    ],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
    """Check get_chunk_n_rows results when no UserWarning is expected."""
    # working_memory passed explicitly; any UserWarning becomes an error.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        n_rows = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert n_rows == expected
    assert type(n_rows) is type(expected)

    # working_memory set through config_context instead of being passed.
    with config_context(working_memory=working_memory):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            n_rows = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert n_rows == expected
        assert type(n_rows) is type(expected)
|
||||
|
||||
|
||||
def test_get_chunk_n_rows_warns():
    """Check that warning is raised when working_memory is too low."""
    # One row needs one byte more than 1 MiB.
    row_bytes = 1024 * 1024 + 1
    max_n_rows = None
    working_memory = 1
    expected = 1

    warn_msg = (
        "Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
    )

    # working_memory passed explicitly.
    with pytest.warns(UserWarning, match=warn_msg):
        n_rows = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert n_rows == expected
    assert type(n_rows) is type(expected)

    # working_memory set through config_context instead of being passed.
    with config_context(working_memory=working_memory):
        with pytest.warns(UserWarning, match=warn_msg):
            n_rows = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert n_rows == expected
        assert type(n_rows) is type(expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["source", "message", "is_long"],
    [
        ("ABC", string.ascii_lowercase, False),
        ("ABCDEF", string.ascii_lowercase, False),
        ("ABC", string.ascii_lowercase * 3, True),
        ("ABC" * 10, string.ascii_lowercase, True),
        ("ABC", string.ascii_lowercase + "\u1048", False),
    ],
)
@pytest.mark.parametrize(
    ["time", "time_str"],
    [
        (0.2, "   0.2s"),
        (20, "  20.0s"),
        (2000, "33.3min"),
        (20000, "333.3min"),
    ],
)
def test_message_with_time(source, message, is_long, time, time_str):
    """Check the layout of the string built by _message_with_time.

    The output is peeled piece by piece: source prefix, then from the right
    the time string, ", total=", the message and one space. What is left is
    either empty (long messages) or made of dots only.
    """
    out = _message_with_time(source, message, time)
    if is_long:
        assert len(out) > 70
    else:
        assert len(out) == 70

    prefix = "[" + source + "] "
    assert out.startswith(prefix)
    remainder = out[len(source) + 3 :]

    # Strip the expected trailing pieces one after the other.
    for suffix in (time_str, ", total=", message):
        assert remainder.endswith(suffix)
        remainder = remainder[: -len(suffix)]
    assert remainder.endswith(" ")
    remainder = remainder[:-1]

    if is_long:
        assert not remainder
    else:
        assert list(set(remainder)) == ["."]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["message", "expected"],
    [
        ("hello", _message_with_time("ABC", "hello", 0.1) + "\n"),
        ("", _message_with_time("ABC", "", 0.1) + "\n"),
        (None, ""),
    ],
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
    """Check what _print_elapsed_time writes to stdout.

    ``timeit.default_timer`` is patched to return 0 before the block and 0.1
    inside it.
    """

    def timer_at_start():
        return 0

    def timer_at_end():
        return 0.1

    monkeypatch.setattr(timeit, "default_timer", timer_at_start)
    with _print_elapsed_time("ABC", message):
        monkeypatch.setattr(timeit, "default_timer", timer_at_end)
    captured = capsys.readouterr()
    assert captured.out == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "value, result",
    [
        (float("nan"), True),
        (np.nan, True),
        (float(np.nan), True),
        (np.float32(np.nan), True),
        (np.float64(np.nan), True),
        (0, False),
        (0.0, False),
        (None, False),
        ("", False),
        ("nan", False),
        ([np.nan], False),
        (9867966753463435747313673, False),  # Python int that overflows with C type
    ],
)
def test_is_scalar_nan(value, result):
    """Check is_scalar_nan on NaN scalars and on assorted non-NaN values."""
    outcome = is_scalar_nan(value)
    assert outcome is result
    # make sure that we are returning a Python bool
    assert isinstance(is_scalar_nan(value), bool)
|
||||
|
||||
|
||||
def test_approximate_mode():
    """Check ``_approximate_mode`` when ``class_counts * n_draws`` is large.

    The product of the class counts and the number of draws used here is
    enough to overflow a 32-bit signed integer, and the result must still be
    valid.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20774
    """
    class_counts = np.array([99000, 1000], dtype=np.int32)
    n_draws = 25000

    drawn = _approximate_mode(class_counts=class_counts, n_draws=n_draws, rng=0)

    # 25000 draws is 25% of the total population of 100000, so a fair draw
    # takes 25% of each class:
    #   25% of 99000 = 24750
    #   25% of  1000 =   250
    expected = [24750, 250]
    assert_array_equal(drawn, expected)
|
||||
|
||||
|
||||
def dummy_func():
    """Do nothing; placeholder callable that always returns ``None``."""
    return None
|
||||
|
||||
|
||||
def test_deprecation_joblib_api(tmpdir):
    """Check that the joblib helpers kept in ``sklearn.utils`` do not warn.

    ``parallel_backend`` and ``register_parallel_backend`` are called through
    ``assert_no_warnings``. The second call registers a backend named
    ``"failing"`` in joblib's global ``BACKENDS`` mapping, which is removed
    again before the test returns.

    The ``tmpdir`` fixture is unused but kept so the signature is unchanged.
    """
    # Only parallel_backend and register_parallel_backend are not deprecated in
    # sklearn.utils
    from sklearn.utils import parallel_backend, register_parallel_backend
    from sklearn.utils._joblib import joblib

    try:
        assert_no_warnings(parallel_backend, "loky", None)
        assert_no_warnings(register_parallel_backend, "failing", None)
        # The original `del` would have raised if the backend had not been
        # registered; keep that check explicit.
        assert "failing" in joblib.parallel.BACKENDS
    finally:
        # Always undo the global registration, even when an assertion above
        # fails, so the "failing" backend cannot leak into other tests.
        joblib.parallel.BACKENDS.pop("failing", None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sequence", [[np.array(1), np.array(2)], [[1, 2], [3, 4]]])
def test_to_object_array(sequence):
    """Check that ``_to_object_array`` returns a 1-D object-dtype ndarray."""
    converted = _to_object_array(sequence)

    assert isinstance(converted, np.ndarray)
    # Object dtype and a single dimension are both required.
    assert (converted.dtype.kind, converted.ndim) == ("O", 1)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,24 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from sklearn.utils._weight_vector import (
|
||||
WeightVector32,
|
||||
WeightVector64,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype, WeightVector",
    [
        (np.float32, WeightVector32),
        (np.float64, WeightVector64),
    ],
)
def test_type_invariance(dtype, WeightVector):
    """Check the `dtype` consistency of `WeightVector`.

    Two random arrays of 100 values are cast to ``dtype`` and passed to the
    ``WeightVector`` class under test. Its ``w`` and ``aw`` attributes, viewed
    as arrays, must then report that same dtype.
    """
    n_values = 100
    weights = np.random.rand(n_values).astype(dtype)
    average_weights = np.random.rand(n_values).astype(dtype)

    weight_vector = WeightVector(weights, average_weights)

    expected_dtype = np.dtype(dtype)
    for attribute in ("w", "aw"):
        exposed = np.asarray(getattr(weight_vector, attribute))
        assert exposed.dtype is expected_dtype
|
||||
Reference in New Issue
Block a user