first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
Common utilities for testing clustering.
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Generate sample data
|
||||
|
||||
|
||||
def generate_clustered_data(
    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
):
    """Build a seeded sample of Gaussian clusters, stacked cluster after cluster.

    Parameters
    ----------
    seed : int
        Seed for the private ``RandomState`` used to draw the noise.
    n_clusters : int
        Number of clusters to draw; the table of centers below has four rows.
    n_features : int
        Number of leading coordinates kept from each center (four available).
    n_samples_per_cluster : int
        Number of rows generated for each cluster.
    std : float
        Scale applied to the standard-normal noise.

    Returns
    -------
    ndarray of shape (n_clusters * n_samples_per_cluster, n_features)
    """
    rng = np.random.RandomState(seed)

    # The centers are deliberately shifted away from zero to check the
    # robustness of clustering algorithms with regard to non-centered data.
    centers = 10 + np.array(
        [
            [1, 1, 1, 0],
            [-1, -1, 0, 1],
            [1, -1, 1, 1],
            [-1, 1, 1, 0],
        ]
    )

    # Start from an empty (0, n_features) block so that zero clusters still
    # yields a correctly shaped float array.
    blocks = [np.empty((0, n_features))]
    for cluster_idx in range(n_clusters):
        noise = std * rng.randn(n_samples_per_cluster, n_features)
        blocks.append(centers[cluster_idx][:n_features] + noise)
    return np.vstack(blocks)
|
||||
@@ -0,0 +1,284 @@
|
||||
"""
|
||||
Testing for Clustering methods
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import warnings
|
||||
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
from sklearn.cluster import AffinityPropagation
|
||||
from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences
|
||||
from sklearn.cluster import affinity_propagation
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
# Shared module-level data: 60 two-dimensional samples drawn around three
# centers that are offset by 10 from the origin.
n_clusters = 3
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    n_samples=60,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)
|
||||
|
||||
|
||||
def test_affinity_propagation():
    """Run affinity propagation through the function and estimator APIs.

    The function, the estimator on precomputed similarities, the estimator on
    raw features and the in-place (``copy=False``) function call must agree.
    """
    # Similarities are negative squared euclidean distances.
    similarities = -euclidean_distances(X, squared=True)
    preference = 10 * np.median(similarities)

    # Function API
    exemplars, labels = affinity_propagation(
        similarities, preference=preference, random_state=39
    )
    assert n_clusters == len(exemplars)

    # Estimator on the precomputed similarity matrix
    precomputed_model = AffinityPropagation(
        preference=preference, affinity="precomputed", random_state=28
    )
    labels_precomputed = precomputed_model.fit(similarities).labels_

    # Estimator on the raw features
    af = AffinityPropagation(preference=preference, verbose=True, random_state=37)
    labels = af.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    n_found = len(af.cluster_centers_indices_)
    assert np.unique(labels).size == n_found
    assert n_clusters == n_found

    # Test also with no copy (kept last: the matrix may be modified in place)
    _, labels_no_copy = affinity_propagation(
        similarities, preference=preference, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)
|
||||
|
||||
|
||||
def test_affinity_propagation_affinity_shape():
    """A non-square similarity matrix given to `affinity_propagation` must raise."""
    similarities = -euclidean_distances(X, squared=True)
    # Dropping the last column makes the matrix rectangular.
    rectangular = similarities[:, :-1]
    with pytest.raises(ValueError, match="S must be a square array"):
        affinity_propagation(rectangular)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input, params, err_type, err_msg",
    [
        (X, {"damping": 0}, ValueError, "damping == 0, must be >= 0.5"),
        (X, {"damping": 2}, ValueError, "damping == 2, must be < 1"),
        (X, {"max_iter": 0}, ValueError, "max_iter == 0, must be >= 1."),
        (X, {"convergence_iter": 0}, ValueError, "convergence_iter == 0, must be >= 1"),
        (X, {"affinity": "unknown"}, ValueError, "Affinity must be"),
        (
            csr_matrix((3, 3)),
            {"affinity": "precomputed"},
            TypeError,
            "A sparse matrix was passed, but dense data is required",
        ),
    ],
)
def test_affinity_propagation_params_validation(input, params, err_type, err_msg):
    """Each invalid configuration must raise the expected error on `fit`."""
    with pytest.raises(err_type, match=err_msg):
        estimator = AffinityPropagation(**params)
        estimator.fit(input)
|
||||
|
||||
|
||||
def test_affinity_propagation_predict():
    """`predict` on the training data must reproduce the `fit_predict` labels."""
    model = AffinityPropagation(affinity="euclidean", random_state=63)
    fitted_labels = model.fit_predict(X)
    predicted_labels = model.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
|
||||
|
||||
|
||||
def test_affinity_propagation_predict_error():
    """`predict` must raise ``ValueError`` when it cannot be used."""
    # Case 1: the estimator has not been fitted.
    unfitted = AffinityPropagation(affinity="euclidean")
    with pytest.raises(ValueError):
        unfitted.predict(X)

    # Case 2: predict is not supported when affinity="precomputed".
    gram = np.dot(X, X.T)
    precomputed = AffinityPropagation(affinity="precomputed", random_state=57)
    precomputed.fit(gram)
    with pytest.raises(ValueError):
        precomputed.predict(X)
|
||||
|
||||
|
||||
def test_affinity_propagation_fit_non_convergence():
    """Non-convergence during `fit` yields no centers and all-noise labels.

    The cluster centers must be an empty array and every training sample must
    be labelled as noise (-1).
    """
    X = np.array([[0, 0], [1, 1], [-2, -2]])

    # A single iteration is not enough to converge.
    model = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
    with pytest.warns(ConvergenceWarning):
        model.fit(X)

    expected_centers = np.empty((0, 2))
    expected_labels = np.array([-1, -1, -1])
    assert_array_equal(expected_centers, model.cluster_centers_)
    assert_array_equal(expected_labels, model.labels_)
|
||||
|
||||
|
||||
def test_affinity_propagation_equal_mutual_similarities():
    """Check the results when all mutual similarities are equal."""
    X = np.array([[-1, 1], [1, -1]])
    S = -euclidean_distances(X, squared=True)

    # Preference above the similarity: every sample becomes an exemplar.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(S, preference=0)
    assert_array_equal([0, 1], exemplars)
    assert_array_equal([0, 1], labels)

    # Preference below the similarity: one cluster, with an arbitrary (first)
    # sample as exemplar.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(S, preference=-10)
    assert_array_equal([0], exemplars)
    assert_array_equal([0, 0], labels)

    # Different preferences: any UserWarning is turned into an error here, so
    # none may be emitted.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        exemplars, labels = affinity_propagation(
            S, preference=[-20, -10], random_state=37
        )

    # One cluster, with the highest-preference sample as exemplar.
    assert_array_equal([1], exemplars)
    assert_array_equal([0, 0], labels)
|
||||
|
||||
|
||||
def test_affinity_propagation_predict_non_convergence():
    """After a non-converged fit, `predict` labels every new sample as noise."""
    X = np.array([[0, 0], [1, 1], [-2, -2]])

    # A single iteration is not enough to converge.
    model = AffinityPropagation(preference=-10, max_iter=1, random_state=75)
    with pytest.warns(ConvergenceWarning):
        af = model.fit(X)

    # There are no clusters, so new samples are expected to be noise (-1).
    new_samples = np.array([[2, 2], [3, 3], [4, 4]])
    with pytest.warns(ConvergenceWarning):
        predicted = af.predict(new_samples)
    assert_array_equal(np.array([-1, -1, -1]), predicted)
|
||||
|
||||
|
||||
def test_affinity_propagation_non_convergence_regressiontest():
    """With ``max_iter=2`` the fit warns and assigns label 0 to all samples."""
    X = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]])
    expected_warning = (
        "Affinity propagation did not converge, this model may return degenerate"
        " cluster centers and labels."
    )
    model = AffinityPropagation(affinity="euclidean", max_iter=2, random_state=34)
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        model.fit(X)

    assert_array_equal(np.array([0, 0, 0]), model.labels_)
|
||||
|
||||
|
||||
def test_equal_similarities_and_preferences():
    """Exercise `_equal_similarities_and_preferences` on (un)equal inputs."""
    # Three points with unequal pairwise distances: never "all equal",
    # whatever the preference.
    points = np.array([[0, 0], [1, 1], [-2, -2]])
    S = -euclidean_distances(points, squared=True)
    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
        assert not _equal_similarities_and_preferences(S, preference)

    # Two points: the single mutual distance is trivially equal.
    points = np.array([[0, 0], [1, 1]])
    S = -euclidean_distances(points, squared=True)

    # Different preferences
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Same preferences
    for preference in (np.array([0, 0]), np.array(0)):
        assert _equal_similarities_and_preferences(S, preference)
|
||||
|
||||
|
||||
def test_affinity_propagation_random_state():
    """Two different `random_state` values must give clearly different centers."""
    # Generate sample data
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(
        n_samples=300, centers=blob_centers, cluster_std=0.5, random_state=0
    )

    # Fit the same truncated configuration once per seed.
    centers0 = (
        AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0)
        .fit(X)
        .cluster_centers_
    )
    centers76 = (
        AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76)
        .fit(X)
        .cluster_centers_
    )

    mean_squared_gap = np.mean((centers0 - centers76) ** 2)
    assert mean_squared_gap > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("centers", [csr_matrix(np.zeros((1, 10))), np.zeros((1, 10))])
def test_affinity_propagation_convergence_warning_dense_sparse(centers):
    """Non-regression, see #13334

    With a single cluster center set by hand (dense or sparse), `predict` must
    return label 0 for every sample without emitting a ConvergenceWarning.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(40, 10)
    y = (4 * rng.rand(40)).astype(int)

    model = AffinityPropagation(random_state=46)
    model.fit(X, y)
    # Overwrite the fitted centers with the parametrized single center.
    model.cluster_centers_ = centers

    expected = np.zeros(X.shape[0], dtype=int)
    with warnings.catch_warnings():
        # Any ConvergenceWarning raised by predict becomes a test failure.
        warnings.simplefilter("error", ConvergenceWarning)
        assert_array_equal(model.predict(X), expected)
|
||||
|
||||
|
||||
def test_affinity_propagation_float32():
    """Check the labels obtained from a float32 precomputed affinity matrix.

    Non-regression test for issue #10832 (incorrect clusters due to a dtype
    change).
    """
    affinity = np.array(
        [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype="float32"
    )
    model = AffinityPropagation(preference=1, affinity="precomputed", random_state=0)
    fitted = model.fit(affinity)
    assert_array_equal(fitted.labels_, np.array([0, 1, 1, 2]))
|
||||
|
||||
|
||||
def test_sparse_input_for_predict():
    """`predict` must accept sparse input.

    Non-regression test for issue #20049.
    """
    model = AffinityPropagation(affinity="euclidean", random_state=42)
    model.fit(X)
    # Two all-zero samples stored as a sparse matrix.
    sparse_samples = csr_matrix((2, 2))
    predicted = model.predict(sparse_samples)
    assert_array_equal(predicted, (2, 2))
|
||||
|
||||
|
||||
def test_sparse_input_for_fit_predict():
    """`fit_predict` must accept sparse input.

    Non-regression test for issue #20049.
    """
    model = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    sparse_X = csr_matrix(rng.randint(0, 2, size=(5, 5)))
    predicted = model.fit_predict(sparse_X)
    assert_array_equal(predicted, (0, 1, 1, 2, 3))
|
||||
@@ -0,0 +1,280 @@
|
||||
"""Testing for Spectral Biclustering methods"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import csr_matrix, issparse
|
||||
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
from sklearn.base import BaseEstimator, BiclusterMixin
|
||||
|
||||
from sklearn.cluster import SpectralCoclustering
|
||||
from sklearn.cluster import SpectralBiclustering
|
||||
from sklearn.cluster._bicluster import _scale_normalize
|
||||
from sklearn.cluster._bicluster import _bistochastic_normalize
|
||||
from sklearn.cluster._bicluster import _log_normalize
|
||||
|
||||
from sklearn.metrics import consensus_score, v_measure_score
|
||||
|
||||
from sklearn.datasets import make_biclusters, make_checkerboard
|
||||
|
||||
|
||||
class MockBiclustering(BiclusterMixin, BaseEstimator):
    """Minimal bicluster estimator used to exercise `get_submatrix`."""

    def __init__(self):
        pass

    def get_indices(self, i):
        """Return the same fixed (row, column) indices whatever `i` is.

        Overridden to reproduce the old get_submatrix test: rows 0, 1 and 4,
        columns 2 and 3.
        """
        row_mask = [True, True, False, False, True]
        col_mask = [False, False, True, True]
        return (np.where(row_mask)[0], np.where(col_mask)[0])
|
||||
|
||||
|
||||
def test_get_submatrix():
    """Check `get_submatrix` on array, sparse and list inputs.

    For each container type the extracted block must match the expected
    values, and overwriting the returned block must leave the input untouched.
    """
    data = np.arange(20).reshape(5, 4)
    model = MockBiclustering()

    for X in (data, csr_matrix(data), data.tolist()):
        submatrix = model.get_submatrix(0, X)
        if issparse(submatrix):
            submatrix = submatrix.toarray()
        assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]])

        # Overwrite the result, then verify that the input did not change.
        submatrix[:] = -1
        if issparse(X):
            X = X.toarray()
        # np.asarray is required for the list input: `list != -1` evaluates
        # to a single True, which would make the check pass unconditionally.
        assert np.all(np.asarray(X) != -1)
|
||||
|
||||
|
||||
def _test_shape_indices(model):
|
||||
# Test get_shape and get_indices on fitted model.
|
||||
for i in range(model.n_clusters):
|
||||
m, n = model.get_shape(i)
|
||||
i_ind, j_ind = model.get_indices(i)
|
||||
assert len(i_ind) == m
|
||||
assert len(j_ind) == n
|
||||
|
||||
|
||||
def test_spectral_coclustering():
    """Test Dhillon's Spectral CoClustering on a simple problem.

    Every parameter combination in the grid is fitted on both the dense and
    the sparse version of the data and must recover the biclusters exactly.
    """
    seed = 0
    grid = ParameterGrid(
        {
            "svd_method": ["randomized", "arpack"],
            "n_svd_vecs": [None, 20],
            "mini_batch": [False, True],
            "init": ["k-means++"],
            "n_init": [10],
        }
    )
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, random_state=seed)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values

    for mat in (S, csr_matrix(S)):
        for kwargs in grid:
            model = SpectralCoclustering(n_clusters=3, random_state=seed, **kwargs)
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            # Summing the indicator matrices over clusters must give one
            # cluster per row and per column.
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
|
||||
|
||||
|
||||
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset.

    Each non-default parameter value is applied on its own to an otherwise
    identical estimator, for both dense and sparse input.
    """
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5, random_state=0)

    non_default_params = {
        "method": ["scale", "log"],
        "svd_method": ["arpack"],
        "n_svd_vecs": [20],
        "mini_batch": [True],
    }

    for mat in (S, csr_matrix(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init="k-means++",
                    random_state=0,
                )
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get("method") == "log":
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue

                model.fit(mat)

                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
                assert consensus_score(model.biclusters_, (rows, cols)) == 1

                _test_shape_indices(model)
|
||||
|
||||
|
||||
def _do_scale_test(scaled):
|
||||
"""Check that rows sum to one constant, and columns to another."""
|
||||
row_sum = scaled.sum(axis=1)
|
||||
col_sum = scaled.sum(axis=0)
|
||||
if issparse(scaled):
|
||||
row_sum = np.asarray(row_sum).squeeze()
|
||||
col_sum = np.asarray(col_sum).squeeze()
|
||||
assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1)
|
||||
assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1)
|
||||
|
||||
|
||||
def _do_bistochastic_test(scaled):
    """Check that rows and columns sum to the same constant."""
    _do_scale_test(scaled)
    mean_col_sum = scaled.sum(axis=0).mean()
    mean_row_sum = scaled.sum(axis=1).mean()
    assert_almost_equal(mean_col_sum, mean_row_sum, decimal=1)
|
||||
|
||||
|
||||
def test_scale_normalize():
    """`_scale_normalize` must equalize row/column sums and keep sparsity."""
    rng = np.random.RandomState(0)
    dense = rng.rand(100, 100)
    for mat in (dense, csr_matrix(dense)):
        # Only the first returned value (the scaled matrix) is checked.
        scaled, _, _ = _scale_normalize(mat)
        _do_scale_test(scaled)
        if issparse(mat):
            assert issparse(scaled)
|
||||
|
||||
|
||||
def test_bistochastic_normalize():
    """`_bistochastic_normalize` must give equal row/column sums and keep sparsity."""
    rng = np.random.RandomState(0)
    dense = rng.rand(100, 100)
    for mat in (dense, csr_matrix(dense)):
        scaled = _bistochastic_normalize(mat)
        _do_bistochastic_test(scaled)
        if issparse(mat):
            assert issparse(scaled)
|
||||
|
||||
|
||||
def test_log_normalize():
    """Adding any constant to a log-scaled matrix should make it bistochastic."""
    rng = np.random.RandomState(0)
    mat = rng.rand(100, 100)
    shifted = _log_normalize(mat) + 1
    _do_bistochastic_test(shifted)
|
||||
|
||||
|
||||
def test_fit_best_piecewise():
    """`_fit_best_piecewise` must select the first two vectors in this example."""
    vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]])
    model = SpectralBiclustering(random_state=0)
    selected = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    assert_array_equal(selected, vectors[:2])
|
||||
|
||||
|
||||
def test_project_and_cluster():
    """`_project_and_cluster` must separate the two row groups, dense or sparse."""
    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    vectors = np.array([[1, 0], [0, 1], [0, 0]])
    expected_labels = [0, 0, 1, 1]

    model = SpectralBiclustering(random_state=0)
    for mat in (data, csr_matrix(data)):
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        # v-measure ignores how the cluster ids are numbered.
        assert_almost_equal(v_measure_score(labels, expected_labels), 1.0)
|
||||
|
||||
|
||||
def test_perfect_checkerboard():
    """Noise-free checkerboards of several shapes must be recovered exactly."""
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    # Square, tall and wide inputs, refitting the same estimator each time.
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        assert consensus_score(model.biclusters_, (rows, cols)) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        ({"n_init": 0}, ValueError, "n_init == 0, must be >= 1."),
        ({"n_init": 1.5}, TypeError, "n_init must be an instance of"),
        (
            {"n_clusters": "abc"},
            TypeError,
            "n_clusters must be an instance of",
        ),
        ({"svd_method": "unknown"}, ValueError, "Unknown SVD method: 'unknown'"),
    ],
)
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralCoclustering`"""
    X = np.arange(25).reshape((5, 5))
    estimator = SpectralCoclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        estimator.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        ({"n_init": 0}, ValueError, "n_init == 0, must be >= 1."),
        ({"n_init": 1.5}, TypeError, "n_init must be an instance of"),
        (
            {"n_clusters": (3, 3, 3)},
            ValueError,
            r"Incorrect parameter n_clusters has value: \(3, 3, 3\)",
        ),
        (
            {"n_clusters": "abc"},
            ValueError,
            "Incorrect parameter n_clusters has value: abc",
        ),
        (
            {"n_clusters": (3, "abc")},
            ValueError,
            r"Incorrect parameter n_clusters has value: \(3, 'abc'\)",
        ),
        (
            {"n_clusters": ("abc", 3)},
            ValueError,
            r"Incorrect parameter n_clusters has value: \('abc', 3\)",
        ),
        ({"method": "unknown"}, ValueError, "Unknown method: 'unknown'"),
        ({"n_components": 0}, ValueError, "n_components == 0, must be >= 1."),
        ({"n_components": 1.5}, TypeError, "n_components must be an instance of"),
        ({"n_components": 3, "n_best": 4}, ValueError, "n_best == 4, must be <= 3."),
        ({"n_best": 0}, ValueError, "n_best == 0, must be >= 1."),
        ({"n_best": 1.5}, TypeError, "n_best must be an instance of"),
        ({"svd_method": "unknown"}, ValueError, "Unknown SVD method: 'unknown'"),
    ],
)
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralBiclustering`"""
    X = np.arange(25).reshape((5, 5))
    estimator = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        estimator.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
    """`n_features_in_` must be absent before `fit` and equal to 3 afterwards."""
    data, _, _ = make_biclusters((3, 3), 3, random_state=0)

    assert not hasattr(est, "n_features_in_")
    est.fit(data)
    assert est.n_features_in_ == 3
|
||||
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Tests for the birch clustering algorithm.
|
||||
"""
|
||||
|
||||
from scipy import sparse
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
from sklearn.cluster import Birch
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import ElasticNet
|
||||
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
|
||||
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
|
||||
def test_n_samples_leaves_roots():
    """Sanity check for the number of samples in leaves and roots.

    The sample counts stored in the root subclusters and in all leaf
    subclusters must each add up to the number of training samples.
    """
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=10, random_state=0)
    brc = Birch()
    brc.fit(X)

    n_samples_root = sum(sc.n_samples_ for sc in brc.root_.subclusters_)
    n_samples_leaves = sum(
        sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_
    )
    assert n_samples_leaves == X.shape[0]
    assert n_samples_root == X.shape[0]
|
||||
|
||||
|
||||
def test_partial_fit():
    """Test that fit is equivalent to calling partial_fit multiple times."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=100, random_state=0)
    brc = Birch(n_clusters=3)
    brc.fit(X)

    # Feed the same data in two halves, without a global clustering step.
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_, brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
|
||||
|
||||
|
||||
def test_birch_predict():
    """Test the predict method predicts the nearest centroid."""
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)

    # Shuffle all n_clusters * n_samples_per_cluster rows; the count is taken
    # from the data instead of being hard-coded.
    shuffle_indices = np.arange(X.shape[0])
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]

    brc = Birch(n_clusters=4, threshold=1.0)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_

    # Predicting the training data must reproduce the fitted labels.
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))

    # The labels must match a nearest-centroid assignment, up to a relabelling
    # (v-measure ignores how the cluster ids are numbered).
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
|
||||
|
||||
|
||||
def test_n_clusters():
    """Test that n_clusters param works properly."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=100, centers=10, random_state=0)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert len(brc1.subcluster_centers_) > 10
    assert len(np.unique(brc1.labels_)) == 10

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    err_msg = "n_clusters should be an instance of ClusterMixin or an int"
    with pytest.raises(TypeError, match=err_msg):
        brc3.fit(X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.0)
    with pytest.warns(ConvergenceWarning):
        brc4.fit(X)
|
||||
|
||||
|
||||
def test_sparse_X():
    """Test that sparse and dense data give same results."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=100, centers=10, random_state=0)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = sparse.csr_matrix(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_array_almost_equal(brc.subcluster_centers_, brc_sparse.subcluster_centers_)
|
||||
|
||||
|
||||
def test_partial_fit_second_call_error_checks():
    """A second `partial_fit` call must reject a different number of features."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=100, random_state=0)
    brc = Birch(n_clusters=3)
    brc.partial_fit(X, y)

    # Keep only the first column: one feature instead of the original two.
    msg = "X has 1 features, but Birch is expecting 2 features"
    with pytest.raises(ValueError, match=msg):
        brc.partial_fit(X[:, [0]], y)
|
||||
|
||||
|
||||
def check_branching_factor(node, branching_factor):
    """Recursively assert that no node holds more than `branching_factor` subclusters."""
    children = node.subclusters_
    assert len(children) <= branching_factor
    for subcluster in children:
        # Descend only where the subcluster has a (truthy) child node.
        if subcluster.child_:
            check_branching_factor(subcluster.child_, branching_factor)
|
||||
|
||||
|
||||
def test_branching_factor():
    """Test that nodes have at max branching_factor number of subclusters."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(random_state=0)
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    # Checked both without and with a global clustering step.
    for n_clusters in (None, 3):
        brc = Birch(
            n_clusters=n_clusters, branching_factor=branching_factor, threshold=0.01
        )
        brc.fit(X)
        check_branching_factor(brc.root_, branching_factor)
|
||||
|
||||
|
||||
def check_threshold(birch_instance, threshold):
    """Assert that every leaf subcluster radius is at most `threshold`.

    Use the leaf linked list for traversal, starting after the dummy leaf.
    """
    leaf = birch_instance.dummy_leaf_.next_leaf_
    while leaf:
        for subcluster in leaf.subclusters_:
            assert subcluster.radius <= threshold
        leaf = leaf.next_leaf_
|
||||
|
||||
|
||||
def test_threshold():
    """Test that the leaf subclusters have a radius no larger than the threshold."""
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=80, centers=4, random_state=0)

    for threshold in (0.5, 5.0):
        brc = Birch(threshold=threshold, n_clusters=None)
        brc.fit(X)
        check_threshold(brc, threshold)
|
||||
|
||||
|
||||
def test_birch_n_clusters_long_int():
    """Birch must accept `n_clusters` given as ``np.int64``.

    Such values come, for instance, from np.arange. See #16484.
    """
    data, _ = make_blobs(random_state=0)
    # The test passes as long as fitting does not raise.
    Birch(n_clusters=np.int64(5)).fit(data)
|
||||
|
||||
|
||||
# TODO: Remove in 1.2
|
||||
@pytest.mark.parametrize("attribute", ["fit_", "partial_fit_"])
def test_birch_fit_attributes_deprecated(attribute):
    """Test that fit_ and partial_fit_ attributes are deprecated."""
    msg = f"`{attribute}` is deprecated in 1.0 and will be removed in 1.2"
    # Seeded so that a failure can be reproduced.
    X, y = make_blobs(n_samples=10, random_state=0)
    brc = Birch().fit(X, y)

    # Merely reading the attribute must emit the deprecation warning.
    with pytest.warns(FutureWarning, match=msg):
        getattr(brc, attribute)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        ({"threshold": -1.0}, ValueError, "threshold == -1.0, must be > 0.0."),
        ({"threshold": 0.0}, ValueError, "threshold == 0.0, must be > 0.0."),
        ({"branching_factor": 0}, ValueError, "branching_factor == 0, must be > 1."),
        ({"branching_factor": 1}, ValueError, "branching_factor == 1, must be > 1."),
        (
            {"branching_factor": 1.5},
            TypeError,
            "branching_factor must be an instance of int, not float.",
        ),
        ({"branching_factor": -2}, ValueError, "branching_factor == -2, must be > 1."),
        ({"n_clusters": 0}, ValueError, "n_clusters == 0, must be >= 1."),
        (
            {"n_clusters": 2.5},
            TypeError,
            "n_clusters must be an instance of int, not float.",
        ),
        (
            {"n_clusters": "whatever"},
            TypeError,
            "n_clusters should be an instance of ClusterMixin or an int",
        ),
        ({"n_clusters": -3}, ValueError, "n_clusters == -3, must be >= 1."),
    ],
)
def test_birch_params_validation(params, err_type, err_msg):
    """Check the parameters validation in `Birch`."""
    # Seeded so that a failure can be reproduced.
    X, _ = make_blobs(n_samples=80, centers=4, random_state=0)
    with pytest.raises(err_type, match=err_msg):
        Birch(**params).fit(X)
|
||||
|
||||
|
||||
def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`.

    One name of the form ``birch<i>`` is expected per subcluster center.
    """
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    brc = Birch(n_clusters=4)
    brc.fit(data)

    n_subclusters = brc.subcluster_centers_.shape[0]
    expected_names = [f"birch{i}" for i in range(n_subclusters)]
    assert_array_equal(expected_names, brc.get_feature_names_out())
|
||||
@@ -0,0 +1,160 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal, assert_allclose
|
||||
from sklearn.cluster import BisectingKMeans
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"])
def test_three_clusters(bisecting_strategy):
    """Fit bisecting k-means with three clusters on a tiny hand-made dataset.

    Checks the centers, the training labels and the prediction for two new
    points, to verify that the data is split correctly.
    """
    # Natural ordering of the points:
    # X = np.array([[1, 2], [1, 4], [1, 0],
    #               [10, 2], [10, 4], [10, 0],
    #               [10, 6], [10, 8], [10, 10]])

    # X[0][1] swapped with X[1][1] intentionally for checking labeling
    X = np.array(
        [[1, 2], [10, 4], [1, 0], [10, 2], [1, 4], [10, 0], [10, 6], [10, 8], [10, 10]]
    )
    model = BisectingKMeans(
        n_clusters=3, random_state=0, bisecting_strategy=bisecting_strategy
    )
    model.fit(X)

    expected_centers = [[10, 2], [10, 8], [1, 2]]
    expected_labels = [2, 0, 2, 0, 2, 0, 1, 1, 1]
    expected_predict = [2, 0]
    new_points = [[0, 0], [12, 3]]

    assert_allclose(expected_centers, model.cluster_centers_)
    assert_array_equal(expected_predict, model.predict(new_points))
    assert_array_equal(expected_labels, model.labels_)
|
||||
|
||||
|
||||
def test_sparse():
    """Test Bisecting K-Means with sparse data.

    Checks that the cluster centers are the same (up to ``atol=1e-8``) when
    the same estimator is fitted on a CSR matrix and on its dense equivalent.
    """
    random_state = np.random.RandomState(0)

    dense = random_state.rand(20, 2)
    dense[dense < 0.8] = 0  # zero out most entries so the CSR form is sparse
    csr = sp.csr_matrix(dense)

    model = BisectingKMeans(n_clusters=3, random_state=0)

    # The same estimator is fitted twice; the centers are read after each fit.
    model.fit(csr)
    centers_from_sparse = model.cluster_centers_

    model.fit(dense)
    centers_from_dense = model.cluster_centers_

    assert_allclose(centers_from_dense, centers_from_sparse, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_clusters", [4, 5])
def test_n_clusters(n_clusters):
    """Check that the fitted labels are exactly ``0 .. n_clusters - 1``."""
    data = np.random.RandomState(0).rand(10, 2)

    model = BisectingKMeans(n_clusters=n_clusters, random_state=0)
    model.fit(data)

    found_labels = np.unique(model.labels_)
    assert_array_equal(found_labels, np.arange(n_clusters))
|
||||
|
||||
|
||||
def test_one_cluster():
    """Check the degenerate case of a single cluster.

    Every label (from ``labels_`` and from ``predict``) must be 0 and the only
    center must be the mean of the data.
    """
    points = np.array([[1, 2], [10, 2], [10, 8]])

    model = BisectingKMeans(n_clusters=1, random_state=0).fit(points)

    assert np.all(model.labels_ == 0)
    assert np.all(model.predict(points) == 0)

    overall_mean = points.mean(axis=0, keepdims=True)
    assert_allclose(model.cluster_centers_, overall_mean)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "param, match",
    [
        # Unknown bisecting_strategy value
        (
            {"bisecting_strategy": "None"},
            "Bisect Strategy must be 'biggest_inertia' or 'largest_cluster'",
        ),
        # init passed as an array
        (
            {"init": np.ones((5, 2))},
            "BisectingKMeans does not support init as array.",
        ),
    ],
)
def test_wrong_params(param, match):
    """Check that invalid parameters raise a ValueError with the given message."""
    data = np.random.RandomState(0).rand(5, 2)

    with pytest.raises(ValueError, match=match):
        BisectingKMeans(n_clusters=3, **param).fit(data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_sparse", [True, False])
def test_fit_predict(is_sparse):
    """Check that ``labels_`` after ``fit(X)`` equals ``predict(X)`` on the same data."""
    data = np.random.RandomState(0).rand(10, 2)

    if is_sparse:
        data[data < 0.8] = 0
        data = sp.csr_matrix(data)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(data)

    predicted = model.predict(data)
    assert_array_equal(model.labels_, predicted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_sparse", [True, False])
def test_dtype_preserved(is_sparse, global_dtype):
    """Check that ``cluster_centers_`` has the dtype of the input data."""
    random_state = np.random.RandomState(0)
    data = random_state.rand(10, 2).astype(global_dtype, copy=False)

    if is_sparse:
        data[data < 0.8] = 0
        data = sp.csr_matrix(data)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(data)

    assert model.cluster_centers_.dtype == global_dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_sparse", [True, False])
def test_float32_float64_equivalence(is_sparse):
    """Check that float32 and float64 inputs give the same centers and labels."""
    data = np.random.RandomState(0).rand(10, 2)

    if is_sparse:
        data[data < 0.8] = 0
        data = sp.csr_matrix(data)

    def fit_on(samples):
        # A fresh estimator with identical settings for each precision.
        return BisectingKMeans(n_clusters=3, random_state=0).fit(samples)

    model_64 = fit_on(data)
    model_32 = fit_on(data.astype(np.float32))

    assert_allclose(model_32.cluster_centers_, model_64.cluster_centers_)
    assert_array_equal(model_32.labels_, model_64.labels_)
|
||||
@@ -0,0 +1,460 @@
|
||||
"""
|
||||
Tests for DBSCAN clustering algorithm
|
||||
"""
|
||||
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
|
||||
import warnings
|
||||
|
||||
from scipy.spatial import distance
|
||||
from scipy import sparse
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.cluster import dbscan
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
|
||||
|
||||
# Module-level data shared by the DBSCAN tests below: `n_clusters` generated
# clusters, all other generator settings left at their defaults.
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
|
||||
|
||||
|
||||
def test_dbscan_similarity():
    # Run DBSCAN on a precomputed distance matrix normalised to [0, 1], through
    # both the `dbscan` function and the `DBSCAN` estimator, and check the
    # number of clusters found. eps/min_samples are tuned for this dataset.
    settings = dict(metric="precomputed", eps=0.15, min_samples=10)

    # Pairwise distances scaled by their maximum
    pairwise = distance.squareform(distance.pdist(X))
    normalised = pairwise / np.max(pairwise)

    _, function_labels = dbscan(normalised, **settings)
    # count clusters, leaving out the noise label (-1) when present
    assert len(set(function_labels) - {-1}) == n_clusters

    estimator_labels = DBSCAN(**settings).fit(normalised).labels_
    assert len(set(estimator_labels) - {-1}) == n_clusters
|
||||
|
||||
|
||||
def test_dbscan_feature():
    # Run DBSCAN directly on the feature array with the euclidean metric,
    # through both the `dbscan` function and the `DBSCAN` estimator.
    # eps is larger than in the precomputed test because the distances are
    # not normalised here.
    settings = dict(metric="euclidean", eps=0.8, min_samples=10)

    _, function_labels = dbscan(X, **settings)
    # count clusters, leaving out the noise label (-1) when present
    assert len(set(function_labels) - {-1}) == n_clusters

    estimator_labels = DBSCAN(**settings).fit(X).labels_
    assert len(set(estimator_labels) - {-1}) == n_clusters
|
||||
|
||||
|
||||
def test_dbscan_sparse():
    # A LIL sparse copy of X must give the same core samples and labels as
    # the dense array.
    settings = dict(eps=0.8, min_samples=10)
    sparse_result = dbscan(sparse.lil_matrix(X), **settings)
    dense_result = dbscan(X, **settings)

    # Each result is a (core_samples, labels) pair; compare them item by item.
    for from_dense, from_sparse in zip(dense_result, sparse_result):
        assert_array_equal(from_dense, from_sparse)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("include_self", [False, True])
def test_dbscan_sparse_precomputed(include_self):
    # A sparse radius-neighbors distance graph must give the same core samples
    # and labels as the full dense distance matrix.
    dense_distances = pairwise_distances(X)
    n_samples = dense_distances.shape[0]

    neighbors = NearestNeighbors(radius=0.9).fit(X)
    # The graph is queried with X itself or with None, depending on the flag.
    query = X if include_self else None
    sparse_distances = neighbors.radius_neighbors_graph(X=query, mode="distance")
    # The graph must hold fewer entries than there are off-diagonal pairs,
    # i.e. it is sparse beyond just a missing diagonal.
    assert sparse_distances.nnz < n_samples * (n_samples - 1)

    settings = dict(eps=0.8, min_samples=10, metric="precomputed")
    core_sparse, labels_sparse = dbscan(sparse_distances, **settings)
    core_dense, labels_dense = dbscan(dense_distances, **settings)

    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
|
||||
|
||||
|
||||
def test_dbscan_sparse_precomputed_different_eps():
    # A precomputed neighbors graph built with a radius larger than DBSCAN's
    # eps must lead to the same result as one built with radius == eps.
    eps = 0.2

    results = []
    for radius in (eps, eps + 0.7):
        neighbors = NearestNeighbors(radius=radius).fit(X)
        graph = neighbors.radius_neighbors_graph(X, mode="distance")
        # eps stays the same for both graphs; only the graph radius changes.
        results.append(dbscan(graph, eps=eps, metric="precomputed"))

    (core_tight, labels_tight), (core_wide, labels_wide) = results
    assert_array_equal(core_tight, core_wide)
    assert_array_equal(labels_tight, labels_wide)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_sparse", [True, False])
@pytest.mark.parametrize("metric", ["precomputed", "minkowski"])
def test_dbscan_input_not_modified(use_sparse, metric):
    # dbscan must leave its input untouched, for dense and CSR inputs alike.
    data = np.random.RandomState(0).rand(10, 10)
    if use_sparse:
        data = sparse.csr_matrix(data)
    snapshot = data.copy()

    dbscan(data, metric=metric)

    if not use_sparse:
        assert_array_equal(data, snapshot)
    else:
        # compare sparse matrices through their dense representation
        assert_array_equal(data.toarray(), snapshot.toarray())
|
||||
|
||||
|
||||
def test_dbscan_no_core_samples():
    # With no core sample found, `components_` must be empty with the right
    # number of columns, every label must be -1 and `core_sample_indices_`
    # must be empty. Checked for dense and CSR inputs.
    dense = np.random.RandomState(0).rand(40, 10)
    dense[dense < 0.8] = 0

    for data in (dense, sparse.csr_matrix(dense)):
        model = DBSCAN(min_samples=6).fit(data)
        n_features = data.shape[1]

        assert_array_equal(model.components_, np.empty((0, n_features)))
        assert_array_equal(model.labels_, -1)
        assert model.core_sample_indices_.shape == (0,)
|
||||
|
||||
|
||||
def test_dbscan_callable():
    # Run DBSCAN with a callable metric: the function object
    # `distance.euclidean` is passed instead of a string key. Both the `dbscan`
    # function and the `DBSCAN` estimator are checked.
    # eps is chosen for unnormalised distances.
    settings = dict(
        metric=distance.euclidean, eps=0.8, min_samples=10, algorithm="ball_tree"
    )

    _, function_labels = dbscan(X, **settings)
    # count clusters, leaving out the noise label (-1) when present
    assert len(set(function_labels) - {-1}) == n_clusters

    estimator_labels = DBSCAN(**settings).fit(X).labels_
    assert len(set(estimator_labels) - {-1}) == n_clusters
|
||||
|
||||
|
||||
def test_dbscan_metric_params():
    # Check that DBSCAN accepts the Minkowski exponent through `metric_params`
    # and that the result matches passing `p` directly or using "manhattan".
    p = 1
    common = dict(eps=0.8, min_samples=10, algorithm="ball_tree")

    def core_and_labels(estimator):
        return estimator.core_sample_indices_, estimator.labels_

    # Reference fit: exponent given only via metric_params (p=None). No
    # warning may be recorded while fitting.
    with warnings.catch_warnings(record=True) as recorded:
        reference = DBSCAN(
            metric="minkowski", metric_params={"p": p}, p=None, **common
        ).fit(X)
    assert not recorded, recorded[0].message
    core_ref, labels_ref = core_and_labels(reference)

    # Same exponent passed directly as the `p` parameter
    direct = DBSCAN(metric="minkowski", p=p, **common).fit(X)
    core_direct, labels_direct = core_and_labels(direct)

    assert_array_equal(core_ref, core_direct)
    assert_array_equal(labels_ref, labels_direct)

    # Minkowski with p=1 should be equivalent to Manhattan distance
    manhattan = DBSCAN(metric="manhattan", **common).fit(X)
    core_manhattan, labels_manhattan = core_and_labels(manhattan)

    assert_array_equal(core_ref, core_manhattan)
    assert_array_equal(labels_ref, labels_manhattan)

    # When both are given and disagree, a SyntaxWarning is expected and the
    # value from metric_params must win over the `p` argument.
    expected_warning = (
        "Parameter p is found in metric_params. "
        "The corresponding parameter from __init__ "
        "is ignored."
    )
    with pytest.warns(SyntaxWarning, match=expected_warning):
        conflicting = DBSCAN(
            metric="minkowski", metric_params={"p": p}, p=p + 1, **common
        ).fit(X)
        core_conflict, labels_conflict = core_and_labels(conflicting)

    assert_array_equal(core_ref, core_conflict)
    assert_array_equal(labels_ref, labels_conflict)
|
||||
|
||||
|
||||
def test_dbscan_balltree():
    # Check the number of clusters found by DBSCAN with tree-based neighbor
    # searches (ball_tree / kd_tree) under several parameter combinations,
    # after a first run on a precomputed distance matrix.
    eps = 0.8
    min_samples = 10

    def count_clusters(labels):
        # number of clusters, ignoring noise (-1) if present
        return len(set(labels)) - int(-1 in labels)

    distances = pairwise_distances(X)
    _, labels = dbscan(
        distances, metric="precomputed", eps=eps, min_samples=min_samples
    )
    assert count_clusters(labels) == n_clusters

    estimator_variants = (
        {"p": 2.0, "algorithm": "ball_tree"},
        {"p": 2.0, "algorithm": "kd_tree"},
        {"p": 1.0, "algorithm": "ball_tree"},
        {"leaf_size": 20, "algorithm": "ball_tree"},
    )
    for variant in estimator_variants:
        model = DBSCAN(eps=eps, min_samples=min_samples, **variant)
        assert count_clusters(model.fit(X).labels_) == n_clusters
|
||||
|
||||
|
||||
def test_input_validation():
    # Fitting on a plain list of lists must not raise.
    nested_list = [[1.0, 2.0], [3.0, 4.0]]
    estimator = DBSCAN()
    estimator.fit(nested_list)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "args",
    [
        {"algorithm": "blah"},
        {"metric": "blah"},
    ],
)
def test_dbscan_badargs(args):
    # Each of these invalid keyword arguments must make dbscan raise ValueError
    bad_kwargs = args
    with pytest.raises(ValueError):
        dbscan(X, **bad_kwargs)
|
||||
|
||||
|
||||
def test_pickle():
    # A default DBSCAN estimator must survive a pickle round trip and come
    # back as exactly the same class.
    obj = DBSCAN()
    s = pickle.dumps(obj)
    # Exact-type check: classes are compared by identity, not with `==`.
    assert type(pickle.loads(s)) is obj.__class__
|
||||
|
||||
|
||||
def test_boundaries():
    # Each case is (points, eps, whether sample 0 is expected to be core),
    # always with min_samples=2.
    cases = [
        # min_samples is inclusive of the core point itself
        ([[0], [1]], 2, True),
        # eps is inclusive of the circumference
        ([[0], [1], [1]], 1, True),
        # just below that distance, sample 0 is no longer core
        ([[0], [1], [1]], 0.99, False),
    ]
    for points, eps, zero_is_core in cases:
        core, _ = dbscan(points, eps=eps, min_samples=2)
        assert (0 in core) == zero_is_core
|
||||
|
||||
|
||||
def test_weighted_dbscan():
    pair = [[0], [1]]

    # sample_weight of the wrong length must be rejected
    for bad_weights in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(pair, sample_weight=bad_weights)

    # Each case is (expected core samples, extra keyword arguments), always
    # run with min_samples=6.
    core_cases = [
        # sample_weight has an effect (default eps)
        ([], {"sample_weight": None}),
        ([], {"sample_weight": [5, 5]}),
        ([0], {"sample_weight": [6, 5]}),
        ([0, 1], {"sample_weight": [6, 6]}),
        # points within eps of each other
        ([0, 1], {"eps": 1.5, "sample_weight": [5, 1]}),
        # non-positive and non-integer sample_weight
        ([], {"eps": 1.5, "sample_weight": [5, 0]}),
        ([0, 1], {"eps": 1.5, "sample_weight": [5.9, 0.1]}),
        ([0, 1], {"eps": 1.5, "sample_weight": [6, 0]}),
        ([], {"eps": 1.5, "sample_weight": [6, -1]}),
    ]
    for expected_core, kwargs in core_cases:
        core, _ = dbscan(pair, min_samples=6, **kwargs)
        assert_array_equal(expected_core, core)

    # for non-negative sample_weight, cores should be identical to repetition
    sample_weight = np.random.RandomState(42).randint(0, 5, X.shape[0])
    core_weighted, labels_weighted = dbscan(X, sample_weight=sample_weight)
    assert len(labels_weighted) == len(X)

    repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, _ = dbscan(repeated)
    # boolean masks flagging which rows are core samples
    repeated_is_core = np.isin(np.arange(repeated.shape[0]), core_repeated)
    weighted_is_core = np.isin(np.arange(X.shape[0]), core_weighted)
    assert_array_equal(np.repeat(weighted_is_core, sample_weight), repeated_is_core)

    # sample_weight should work with precomputed distance matrix
    distances = pairwise_distances(X)
    core_precomputed, labels_precomputed = dbscan(
        distances, sample_weight=sample_weight, metric="precomputed"
    )
    assert_array_equal(core_weighted, core_precomputed)
    assert_array_equal(labels_weighted, labels_precomputed)

    # sample_weight should work with the estimator's fit ...
    fitted = DBSCAN().fit(X, sample_weight=sample_weight)
    assert_array_equal(core_weighted, fitted.core_sample_indices_)
    assert_array_equal(labels_weighted, fitted.labels_)

    # ... and with fit_predict
    estimator = DBSCAN()
    predicted = estimator.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(core_weighted, estimator.core_sample_indices_)
    assert_array_equal(labels_weighted, predicted)
    assert_array_equal(labels_weighted, estimator.labels_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
def test_dbscan_core_samples_toy(algorithm):
    points = [[0], [2], [3], [4], [6], [8], [10]]
    n_points = len(points)

    # Each scenario is (min_samples, expected core samples, expected labels),
    # always with eps=1.
    scenarios = [
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        (1, np.arange(n_points), [0, 1, 1, 1, 2, 3, 4]),
        # Only the 3 samples from the denser area are core samples. All other
        # points are isolated and considered noise.
        (2, [1, 2, 3], [-1, 0, 0, 0, -1, -1, -1]),
        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        (3, [2], [-1, 0, 0, 0, -1, -1, -1]),
        # No core sample can be extracted any more: everything is noise.
        (4, [], np.full(n_points, -1.0)),
    ]
    for min_samples, expected_core, expected_labels in scenarios:
        core, labels = dbscan(
            points, algorithm=algorithm, eps=1, min_samples=min_samples
        )
        assert_array_equal(core, expected_core)
        assert_array_equal(labels, expected_labels)
|
||||
|
||||
|
||||
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    # see https://github.com/scikit-learn/scikit-learn/issues/4641 for
    # more details
    # An identity matrix and an all-zero matrix, used as precomputed
    # distances, must each produce a single distinct label.
    for distances in (np.eye(10), np.zeros((10, 10))):
        model = DBSCAN(eps=0.5, metric="precomputed")
        distinct_labels = set(model.fit(distances).labels_)
        assert len(distinct_labels) == 1
|
||||
|
||||
|
||||
def test_dbscan_precomputed_metric_with_initial_rows_zero():
    # Precomputed sparse distance matrix whose first two rows are all zero:
    # those two samples must be labelled as noise.
    dense_distances = np.array(
        [
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
            [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
            [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0],
        ]
    )
    sparse_distances = sparse.csr_matrix(dense_distances)

    model = DBSCAN(eps=0.2, metric="precomputed", min_samples=2)
    model.fit(sparse_distances)

    assert_array_equal(model.labels_, [-1, -1, 0, 0, 0, 1, 1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        # eps
        ({"eps": -1.0}, ValueError, "eps == -1.0, must be > 0.0."),
        ({"eps": 0.0}, ValueError, "eps == 0.0, must be > 0.0."),
        # min_samples
        ({"min_samples": 0}, ValueError, "min_samples == 0, must be >= 1."),
        (
            {"min_samples": 1.5},
            TypeError,
            "min_samples must be an instance of int, not float.",
        ),
        ({"min_samples": -2}, ValueError, "min_samples == -2, must be >= 1."),
        # leaf_size
        ({"leaf_size": 0}, ValueError, "leaf_size == 0, must be >= 1."),
        (
            {"leaf_size": 2.5},
            TypeError,
            "leaf_size must be an instance of int, not float.",
        ),
        ({"leaf_size": -3}, ValueError, "leaf_size == -3, must be >= 1."),
        # p
        ({"p": -2}, ValueError, "p == -2, must be >= 0.0."),
        # n_jobs
        (
            {"n_jobs": 2.5},
            TypeError,
            "n_jobs must be an instance of int, not float.",
        ),
    ],
)
def test_dbscan_params_validation(params, err_type, err_msg):
    """Check that invalid `DBSCAN` parameters raise the expected error.

    Each case gives the constructor keyword arguments, the exception type and
    the pattern the error message must match.
    """
    expected_failure = pytest.raises(err_type, match=err_msg)
    with expected_failure:
        DBSCAN(**params).fit(X)
|
||||
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Tests for sklearn.cluster._feature_agglomeration
|
||||
"""
|
||||
# Authors: Sergul Aydore 2017
|
||||
import numpy as np
|
||||
|
||||
from numpy.testing import assert_array_equal
|
||||
from sklearn.cluster import FeatureAgglomeration
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.datasets import make_blobs
|
||||
|
||||
|
||||
def test_feature_agglomeration():
    # Pool the three features of a single sample into one cluster, once with
    # the mean and once with the median, and check labels, transform and
    # inverse_transform for each pooling function.
    n_clusters = 1
    X = np.array([[0, 0, 1]])  # (n_samples, n_features)

    for pooling_func, expected_value in ((np.mean, 1 / 3.0), (np.median, 0.0)):
        agglo = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=pooling_func)
        agglo.fit(X)

        # one label per feature, n_clusters distinct labels overall
        assert np.size(np.unique(agglo.labels_)) == n_clusters
        assert np.size(agglo.labels_) == X.shape[1]

        # Test transform
        pooled = agglo.transform(X)
        assert pooled.shape[1] == n_clusters
        assert pooled == np.array([expected_value])

        # Test inverse transform
        restored = agglo.inverse_transform(pooled)
        assert np.unique(restored[0]).size == n_clusters

        # transforming the restored data must give back the pooled values
        assert_array_almost_equal(agglo.transform(restored), pooled)
|
||||
|
||||
|
||||
def test_feature_agglomeration_feature_names_out():
    """Verify `FeatureAgglomeration.get_feature_names_out`.

    The expected names are ``featureagglomeration0 ...``, one per fitted
    cluster (``n_clusters_``).
    """
    data, _ = make_blobs(n_features=6, random_state=0)
    model = FeatureAgglomeration(n_clusters=3)
    model.fit(data)

    expected_names = [
        f"featureagglomeration{idx}" for idx in range(model.n_clusters_)
    ]
    assert_array_equal(expected_names, model.get_feature_names_out())
|
||||
@@ -0,0 +1,918 @@
|
||||
"""
|
||||
Several basic tests for hierarchical clustering procedures
|
||||
|
||||
"""
|
||||
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
|
||||
# Matteo Visconti di Oleggio Castello 2014
|
||||
# License: BSD 3 clause
|
||||
import itertools
|
||||
from tempfile import mkdtemp
|
||||
import shutil
|
||||
import pytest
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.cluster import hierarchy
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from sklearn.metrics.cluster import adjusted_rand_score
|
||||
from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
|
||||
from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.cluster import ward_tree
|
||||
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
|
||||
from sklearn.cluster._agglomerative import (
|
||||
_hc_cut,
|
||||
_TREE_BUILDERS,
|
||||
linkage_tree,
|
||||
_fix_connectivity,
|
||||
)
|
||||
from sklearn.feature_extraction.image import grid_to_graph
|
||||
from sklearn.metrics import DistanceMetric
|
||||
from sklearn.metrics.pairwise import (
|
||||
PAIRED_DISTANCES,
|
||||
cosine_distances,
|
||||
manhattan_distances,
|
||||
pairwise_distances,
|
||||
)
|
||||
from sklearn.metrics.cluster import normalized_mutual_info_score
|
||||
from sklearn.neighbors import kneighbors_graph
|
||||
from sklearn.cluster._hierarchical_fast import (
|
||||
average_merge,
|
||||
max_merge,
|
||||
mst_linkage_core,
|
||||
)
|
||||
from sklearn.utils._fast_dict import IntFloatDict
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.datasets import make_moons, make_circles
|
||||
|
||||
|
||||
def test_linkage_misc():
    # Miscellaneous checks on linkage: invalid arguments, a smoke test and
    # equivalence between different ways of specifying the affinity.
    data = np.random.RandomState(42).normal(size=(5, 5))

    # An unknown linkage name must be rejected by the estimator and by
    # linkage_tree, and so must a connectivity matrix of the wrong shape.
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage="foo").fit(data)

    with pytest.raises(ValueError):
        linkage_tree(data, linkage="foo")

    with pytest.raises(ValueError):
        linkage_tree(data, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(data)

    # A precomputed cosine distance matrix must give the same first output as
    # affinity="cosine" on the raw data.
    cosine_matrix = cosine_distances(data)
    from_precomputed = linkage_tree(cosine_matrix, affinity="precomputed")
    from_cosine_name = linkage_tree(data, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_cosine_name[0])

    # A callable affinity must give the same first output as the matching
    # string affinity.
    from_callable = linkage_tree(data, affinity=manhattan_distances)
    from_manhattan_name = linkage_tree(data, affinity="manhattan")
    assert_array_equal(from_callable[0], from_manhattan_name[0])
|
||||
|
||||
|
||||
def test_structured_linkage_tree():
    # For every tree builder used with a grid connectivity: check the node
    # count of the resulting tree and the errors raised on bad input.
    random_state = np.random.RandomState(0)
    grid = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    # NOTE(review): only `grid.shape` is used below, so these entries do not
    # affect the connectivity.
    grid[4:7, 4:7] = 0
    samples = random_state.randn(50, 100).T
    connectivity = grid_to_graph(*grid.shape)
    expected_nodes = 2 * samples.shape[0] - 1

    for builder in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = builder(samples, connectivity=connectivity)
        assert len(children) + n_leaves == expected_nodes

        # A connectivity matrix of the wrong shape must raise a ValueError
        wrong_shape = np.ones((4, 4))
        with pytest.raises(ValueError):
            builder(samples, connectivity=wrong_shape)

        # Fitting with no samples must raise a ValueError
        with pytest.raises(ValueError):
            builder(samples[:0], connectivity=connectivity)
|
||||
|
||||
|
||||
def test_unstructured_linkage_tree():
    # Check the node count of unstructured linkage trees (no connectivity),
    # first for `ward_tree` and then for every registered tree builder.
    data = np.random.RandomState(0).randn(50, 100)
    expected_nodes = 2 * data.shape[1] - 1

    for builder in (ward_tree, *_TREE_BUILDERS.values()):
        # Both the full 2D array and its first row (1D) are exercised.
        for subset in (data, data[0]):
            # n_clusters is specified just for the sake of raising a warning
            # and testing the warning code
            with ignore_warnings(), pytest.warns(UserWarning):
                children, _, n_leaves, _ = builder(subset.T, n_clusters=10)
            assert len(children) + n_leaves == expected_nodes
|
||||
|
||||
|
||||
def test_height_linkage_tree():
    # For every tree builder run with a 10x10 grid connectivity, the number of
    # merges plus the number of leaves must equal 2 * n_samples - 1.
    # NOTE(review): despite the test name, heights are not inspected here.
    random_state = np.random.RandomState(0)
    grid = np.ones([10, 10], dtype=bool)
    samples = random_state.randn(50, 100).T
    connectivity = grid_to_graph(*grid.shape)
    expected_nodes = 2 * samples.shape[0] - 1

    for builder in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = builder(samples, connectivity=connectivity)
        assert len(children) + n_leaves == expected_nodes
|
||||
|
||||
|
||||
def test_agglomerative_clustering_wrong_arg_memory():
    # Fitting must raise a ValueError when `memory` is an integer, i.e.
    # neither a str nor a joblib.Memory instance.
    data = np.random.RandomState(0).randn(100, 50)
    estimator = AgglomerativeClustering(memory=5)
    with pytest.raises(ValueError):
        estimator.fit(data)
|
||||
|
||||
|
||||
def test_zero_cosine_linkage_tree():
    # An all-zero row in the data must make linkage_tree fail with a
    # ValueError when the 'cosine' affinity is requested.
    data_with_zero_row = np.array([[0, 1], [0, 0]])
    expected_message = "Cosine affinity cannot be used when X contains zero vectors"
    with pytest.raises(ValueError, match=expected_message):
        linkage_tree(data_with_zero_row, affinity="cosine")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
@pytest.mark.parametrize("compute_distances", [True, False])
@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
def test_agglomerative_clustering_distances(
    n_clusters, compute_distances, distance_threshold, linkage
):
    # The fitted model must expose `distances_` exactly when
    # `compute_distances` is True or a `distance_threshold` is given.
    random_state = np.random.RandomState(0)
    grid = np.ones([10, 10], dtype=bool)
    data = random_state.randn(100, 50)
    connectivity = grid_to_graph(*grid.shape)

    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        connectivity=connectivity,
        linkage=linkage,
        distance_threshold=distance_threshold,
        compute_distances=compute_distances,
    )
    model.fit(data)

    expects_distances = compute_distances or distance_threshold is not None
    if not expects_distances:
        assert not hasattr(model, "distances_")
        return

    assert hasattr(model, "distances_")
    # one distance per row of `children_`
    n_merges = model.children_.shape[0]
    assert model.distances_.shape == (n_merges,)
|
||||
|
||||
|
||||
def test_agglomerative_clustering():
    """Exercise AgglomerativeClustering across linkages, caching and affinities.

    For every linkage the test checks: a plain fit, a fit with an on-disk
    cache, equality of labels with early stopping of the tree, a fit without
    connectivity, and rejection of a connectivity matrix of the wrong shape.
    It then checks that ward refuses a non-euclidean affinity, that the
    structured and unstructured "complete" paths agree for every paired
    distance, and that a precomputed distance matrix gives the same labels as
    the raw data.
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        clustering.fit(X)
        # test caching
        # The cache directory is created *before* the ``try`` block: if
        # ``mkdtemp`` failed inside it, ``finally`` would run against an
        # unbound (or stale, already removed) ``tempdir`` and mask the real
        # error.
        tempdir = mkdtemp()
        try:
            clustering = AgglomerativeClustering(
                n_clusters=10,
                connectivity=connectivity,
                memory=tempdir,
                linkage=linkage,
            )
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10
        # Check that we raise a ValueError when the connectivity matrix does
        # not match the number of samples (10 x 10 instead of 100 x 100)
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage,
        )
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward",
    )
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete",
        )
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10, connectivity=None, affinity=affinity, linkage="complete"
        )
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
        )

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(
        n_clusters=10, connectivity=connectivity, linkage="complete"
    )
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity,
        affinity="precomputed",
        linkage="complete",
    )
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
|
||||
|
||||
|
||||
def test_agglomerative_clustering_memory_mapped():
    """Fitting AgglomerativeClustering on mem-mapped data must not fail.

    Non-regression test for issue #19875.
    """
    random_state = np.random.RandomState(0)
    memmapped_X = create_memmap_backed_data(random_state.randn(50, 100))
    estimator = AgglomerativeClustering(affinity="euclidean", linkage="single")
    estimator.fit(memmapped_X)
|
||||
|
||||
|
||||
def test_ward_agglomeration():
    """FeatureAgglomeration yields the requested number of feature groups."""
    n_groups = 5
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    # 10 x 10 grid graph: one node per feature.
    feature_graph = grid_to_graph(10, 10)
    agglo = FeatureAgglomeration(n_clusters=n_groups, connectivity=feature_graph)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == n_groups

    reduced = agglo.transform(X)
    assert reduced.shape[1] == n_groups
    restored = agglo.inverse_transform(reduced)
    # Every restored row holds exactly one distinct value per feature group.
    assert np.unique(restored[0]).size == n_groups
    assert_array_almost_equal(agglo.transform(restored), reduced)

    # Check that fitting with no samples raises a ValueError
    empty_X = X[:0]
    with pytest.raises(ValueError):
        agglo.fit(empty_X)
|
||||
|
||||
|
||||
def test_single_linkage_clustering():
    """Single linkage must recover the true labels of moons and circles."""
    datasets = (
        make_moons(noise=0.05, random_state=42),
        make_circles(factor=0.5, noise=0.025, random_state=42),
    )
    for points, expected_labels in datasets:
        model = AgglomerativeClustering(n_clusters=2, linkage="single")
        model.fit(points)
        score = normalized_mutual_info_score(model.labels_, expected_labels)
        assert_almost_equal(score, 1)
|
||||
|
||||
|
||||
def assess_same_labelling(cut1, cut2):
    """Util for comparison with scipy.

    Assert that two flat clusterings describe the same partition of the
    samples, regardless of how the clusters are numbered.

    Parameters
    ----------
    cut1, cut2 : array-like of non-negative ints, shape (n_samples,)
        Cluster label of every sample. Lists are accepted as well as arrays.

    Raises
    ------
    AssertionError
        If the labelings differ in length or group the samples differently.
    """
    co_clust = []
    for cut in (cut1, cut2):
        # Accept plain sequences too: `.max()` and fancy indexing need arrays.
        cut = np.asarray(cut)
        n = len(cut)
        k = cut.max() + 1
        # One-hot encode the labels; ecut @ ecut.T is then the co-membership
        # matrix, which does not depend on how the clusters are numbered.
        ecut = np.zeros((n, k))
        ecut[np.arange(n), cut] = 1
        co_clust.append(np.dot(ecut, ecut.T))
    # Comparing matrices of different shapes would not be element-wise, so
    # fail with an explicit message instead.
    assert co_clust[0].shape == co_clust[1].shape, "labelings differ in length"
    assert (co_clust[0] == co_clust[1]).all()
|
||||
|
||||
|
||||
def test_sparse_scikit_vs_scipy():
    """Compare every tree builder under full connectivity with scipy."""
    n_samples, n_features, n_cut_clusters = 10, 5, 3
    rng = np.random.RandomState(0)

    # A dense all-ones array is used on purpose instead of a lil_matrix, to
    # check that non-sparse connectivity is handled as well.
    connectivity = np.ones((n_samples, n_samples))
    for linkage, build_tree in _TREE_BUILDERS.items():
        for _ in range(5):
            X = 0.1 * rng.normal(size=(n_samples, n_features))
            X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            scipy_tree = hierarchy.linkage(X, method=linkage)
            scipy_children = scipy_tree[:, :2].astype(int, copy=False)

            children, _, n_leaves, _ = build_tree(X, connectivity=connectivity)

            # Order the two children of every merge so rows are comparable.
            children.sort(axis=1)
            assert_array_equal(
                children,
                scipy_children,
                "linkage tree differs from scipy impl for linkage: " + linkage,
            )

            our_cut = _hc_cut(n_cut_clusters, children, n_leaves)
            scipy_cut = _hc_cut(n_cut_clusters, scipy_children, n_leaves)
            assess_same_labelling(our_cut, scipy_cut)

    # Test error management in _hc_cut: asking for more clusters than leaves
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)
|
||||
|
||||
|
||||
# The single-linkage tree builder (backed by our own mst_linkage_core) must
# give the same results as scipy's builtin.
@pytest.mark.parametrize("seed", range(5))
def test_vector_scikit_single_vs_scipy_single(seed):
    """Unstructured single linkage must match scipy on random data."""
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(seed)
    X = 0.1 * rng.normal(size=(n_samples, n_features))
    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    scipy_tree = hierarchy.linkage(X, method="single")
    children_scipy = scipy_tree[:, :2].astype(int)

    build_single_tree = _TREE_BUILDERS["single"]
    children, _, n_leaves, _ = build_single_tree(X)

    # Order the two children of every merge so rows are comparable.
    children.sort(axis=1)
    assert_array_equal(
        children,
        children_scipy,
        "linkage tree differs from scipy impl for single linkage.",
    )

    assess_same_labelling(
        _hc_cut(n_clusters, children, n_leaves),
        _hc_cut(n_clusters, children_scipy, n_leaves),
    )
|
||||
|
||||
|
||||
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric_param_grid):
    """`mst_linkage_core` must give equal results on mem-mapped data.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(seed=1)
    in_memory_X = rng.normal(size=(20, 4))
    memmapped_X = create_memmap_backed_data(in_memory_X)
    metric, param_grid = metric_param_grid
    param_names = param_grid.keys()
    # Try every combination of the parameter values listed for this metric.
    for param_values in itertools.product(*param_grid.values()):
        metric_kwargs = dict(zip(param_names, param_values))
        distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs)
        expected_mst = mst_linkage_core(in_memory_X, distance_metric)
        memmapped_mst = mst_linkage_core(memmapped_X, distance_metric)
        np.testing.assert_equal(expected_mst, memmapped_mst)
|
||||
|
||||
|
||||
def test_identical_points():
    """Identical points must be grouped together under sparse connectivity.

    The data consists of three pairs of duplicated points; with a k-nearest
    neighbours connectivity graph every linkage is expected to recover the
    three pairs as clusters.
    """
    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Symmetrize the k-NN graph.
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")

    # The original tuple listed "average" twice, so "complete" linkage was
    # never exercised; iterate over the four distinct linkages instead.
    for linkage in ("single", "average", "complete", "ward"):
        clustering = AgglomerativeClustering(
            n_clusters=3, linkage=linkage, connectivity=connectivity
        )
        clustering.fit(X)

        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels), 1
        )
|
||||
|
||||
|
||||
def test_connectivity_propagation():
    """Connectivity must be propagated correctly while ward merges clusters."""
    # The sample set contains many exact duplicates.
    points = (
        [(0.014, 0.120), (0.014, 0.099), (0.014, 0.097)]
        + [(0.017, 0.153)] * 2
        + [(0.018, 0.153)] * 7
        + [(0.018, 0.152), (0.018, 0.149), (0.018, 0.144)]
    )
    X = np.array(points)
    knn_graph = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(
        n_clusters=4, connectivity=knn_graph, linkage="ward"
    )
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    model.fit(X)
|
||||
|
||||
|
||||
def test_ward_tree_children_order():
    """Structured and unstructured `ward_tree` must order children alike."""
    n_samples, n_features = 10, 5
    rng = np.random.RandomState(0)

    full_connectivity = np.ones((n_samples, n_samples))
    # test on five random datasets
    for _ in range(5):
        X = 0.1 * rng.normal(size=(n_samples, n_features))
        X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        children_unstructured = ward_tree(X)[0]
        children_structured = ward_tree(X, connectivity=full_connectivity)[0]

        assert_array_equal(children_unstructured, children_structured)
|
||||
|
||||
|
||||
def test_ward_linkage_tree_return_distance():
    """Test the `return_distance` option of `ward_tree` and `linkage_tree`.

    First, on five random datasets, the structured (full connectivity) and
    unstructured code paths must return the same children and the same merge
    distances. Second, on a small dataset taken from scipy's test data, both
    paths must reproduce the known linkage matrices.
    """
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ["average", "complete", "single"]:
            # Keep the whole returned tuple. The original code indexed the
            # call with [-1] and then indexed that result again, so it only
            # compared the first and last merge distances and never looked
            # at the children.
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage, return_distance=True
            )
            unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            # The two children of a merge may be reported in either order
            # depending on the code path, so sort each row before comparing.
            structured_children = np.sort(structured_items[0], axis=1)
            unstructured_children = np.sort(unstructured_items[0], axis=1)
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array(
        [
            [1.43054825, -7.5693489],
            [6.95887839, 6.82293382],
            [2.87137846, -9.68248579],
            [7.87974764, -6.05485803],
            [8.24018364, -6.09495602],
            [7.39020262, 8.54004355],
        ]
    )
    # truth
    linkage_X_ward = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 9.10208346, 4.0],
            [7.0, 9.0, 24.7784379, 6.0],
        ]
    )

    linkage_X_complete = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.96742194, 4.0],
            [7.0, 9.0, 18.77445997, 6.0],
        ]
    )

    linkage_X_average = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.55832839, 4.0],
            [7.0, 9.0, 15.44089605, 6.0],
        ]
    )

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    # Pair every linkage with its reference matrix explicitly. The original
    # zipped three linkage names with two references, which silently dropped
    # "single"; no reference matrix for "single" is defined here.
    linkage_truths = [
        ("complete", linkage_X_complete),
        ("average", linkage_X_average),
    ]
    for linkage, X_truth in linkage_truths:
        out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage, return_distance=True
        )

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
|
||||
|
||||
|
||||
def test_connectivity_fixing_non_lil():
    """Non-regression: fix connectivity that is not item assignable.

    The connectivity given to the estimator has more than one component and
    is not item assignable, which used to break the connectivity fixing.
    """
    # dummy data
    points = np.array([[0, 0], [1, 1]])
    # a mask with several components forces connectivity fixing
    disconnected_mask = np.array([[True, False], [False, True]])
    graph = grid_to_graph(n_x=2, n_y=2, mask=disconnected_mask)
    ward = AgglomerativeClustering(connectivity=graph, linkage="ward")
    with pytest.warns(UserWarning):
        ward.fit(points)
|
||||
|
||||
|
||||
def test_int_float_dict():
    """IntFloatDict returns stored values; the merge helpers run cleanly."""
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    mapping = IntFloatDict(keys, values)
    for position, key in enumerate(keys):
        assert mapping[key] == values[position]

    even_keys = np.arange(50, dtype=np.intp)[::2]
    half_values = np.full(50, 0.5)[::2]
    other_mapping = IntFloatDict(even_keys, half_values)
    # Complete smoke test: the merge results are not inspected.
    for merge in (max_merge, average_merge):
        merge(mapping, other_mapping, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
|
||||
|
||||
|
||||
def test_connectivity_callable():
    """A callable connectivity gives the same labels as the matrix it builds."""
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    knn_matrix = kneighbors_graph(X, 3, include_self=False)
    knn_builder = partial(kneighbors_graph, n_neighbors=3, include_self=False)

    from_matrix = AgglomerativeClustering(connectivity=knn_matrix)
    from_callable = AgglomerativeClustering(connectivity=knn_builder)
    from_matrix.fit(X)
    from_callable.fit(X)
    assert_array_equal(from_matrix.labels_, from_callable.labels_)
|
||||
|
||||
|
||||
def test_connectivity_ignores_diagonal():
    """Self-connections in the connectivity graph must not change labels."""
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    labels_by_self_flag = {}
    for include_self in (False, True):
        graph = kneighbors_graph(X, 3, include_self=include_self)
        model = AgglomerativeClustering(connectivity=graph)
        model.fit(X)
        labels_by_self_flag[include_self] = model.labels_
    assert_array_equal(labels_by_self_flag[False], labels_by_self_flag[True])
|
||||
|
||||
|
||||
def test_compute_full_tree():
    """The full tree is built for a small `n_clusters`, not for a large one."""
    rng = np.random.RandomState(0)

    # When n_clusters is small, the full tree should be built, that is the
    # number of merges should be n_samples - 1
    small_X = rng.randn(10, 2)
    small_graph = kneighbors_graph(small_X, 5, include_self=False)
    model = AgglomerativeClustering(n_clusters=2, connectivity=small_graph)
    model.fit(small_X)
    n_merges = model.children_.shape[0]
    assert n_merges == small_X.shape[0] - 1

    # When n_clusters is large, greater than max of 100 and 0.02 * n_samples,
    # we should stop when there are n_clusters.
    n_clusters = 101
    large_X = rng.randn(200, 2)
    large_graph = kneighbors_graph(large_X, 10, include_self=False)
    model = AgglomerativeClustering(n_clusters=n_clusters, connectivity=large_graph)
    model.fit(large_X)
    n_merges = model.children_.shape[0]
    assert n_merges == large_X.shape[0] - n_clusters
|
||||
|
||||
|
||||
def test_n_components():
    """Every tree builder must report the number of connected components."""
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # The identity matrix connects no two samples: five components.
    connectivity = np.eye(5)

    for build_tree in _TREE_BUILDERS.values():
        quiet_build_tree = ignore_warnings(build_tree)
        tree = quiet_build_tree(X, connectivity=connectivity)
        # The second returned item is the number of components.
        assert tree[1] == 5
|
||||
|
||||
|
||||
def test_agg_n_clusters():
    """A ValueError must be raised when `n_clusters` is not positive."""
    rng = np.random.RandomState(0)
    X = rng.rand(20, 10)
    for invalid_n_clusters in (-1, 0):
        model = AgglomerativeClustering(n_clusters=invalid_n_clusters)
        provided = str(model.n_clusters)
        msg = "n_clusters should be an integer greater than 0. %s was provided." % (
            provided
        )
        with pytest.raises(ValueError, match=msg):
            model.fit(X)
|
||||
|
||||
|
||||
def test_affinity_passed_to_fix_connectivity():
    """The `affinity` callable must actually be called by `linkage_tree`."""
    size = 2
    rng = np.random.RandomState(0)
    X = rng.randn(size, size)
    mask = np.array([True, False, False, True])

    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)

    n_calls = 0

    def counting_affinity(*args, **kwargs):
        # Fake affinity: counts its calls and returns the running count.
        nonlocal n_calls
        n_calls += 1
        return n_calls

    linkage_tree(X, connectivity=connectivity, affinity=counting_affinity)

    assert n_calls == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
def test_agglomerative_clustering_with_distance_threshold(linkage):
    """Clusters from `distance_threshold` must match a cut of the full tree."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    # 10 x 10 grid graph: one node per sample.
    grid_connectivity = grid_to_graph(10, 10)
    # test when distance threshold is set to 10
    distance_threshold = 10
    build_tree = _TREE_BUILDERS[linkage]
    for conn in (None, grid_connectivity):
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage,
        )
        model.fit(X)
        labels_produced = model.labels_
        n_clusters_produced = len(np.unique(labels_produced))

        # Rebuild the tree directly and count the merges whose distance
        # reaches the threshold; each of them separates one more cluster.
        children, _, n_leaves, _, distances = build_tree(
            X, connectivity=conn, n_clusters=None, return_distance=True
        )
        n_clusters_at_threshold = (
            np.count_nonzero(distances >= distance_threshold) + 1
        )
        # test number of clusters produced
        assert n_clusters_at_threshold == n_clusters_produced

        # test clusters produced
        labels_at_threshold = _hc_cut(
            n_clusters=n_clusters_produced, children=children, n_leaves=n_leaves
        )
        assert np.array_equiv(labels_produced, labels_at_threshold)
|
||||
|
||||
|
||||
def test_small_distance_threshold():
    """A threshold below all pairwise distances keeps every sample apart."""
    rng = np.random.RandomState(0)
    n_samples = 10
    # Single source of truth for the threshold: the original comments and
    # sanity check used 0.1 while the estimator was given 1.0.
    distance_threshold = 1.0
    X = rng.randint(-300, 300, size=(n_samples, 3))
    # this should result in all data in their own clusters, given that
    # their pairwise distances are at least `distance_threshold` (which may
    # not be the case with a different random seed).
    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
    ).fit(X)
    # check that the pairwise distances indeed all reach the threshold, so
    # that the premise of this test holds
    all_distances = pairwise_distances(X, metric="minkowski", p=2)
    # ignore the zero self-distances on the diagonal
    np.fill_diagonal(all_distances, np.inf)
    assert np.all(all_distances >= distance_threshold)
    assert clustering.n_clusters_ == n_samples
|
||||
|
||||
|
||||
def test_cluster_distances_with_distance_threshold():
    """Check distances within and between single-linkage threshold clusters."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    distance_threshold = 4
    model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
    )
    labels = model.fit(X).labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        members = labels == label
        member_rows = D[members]
        # Largest nearest-neighbour distance inside the cluster ...
        widest_inner_gap = member_rows[:, members].min(axis=0).max()
        # ... and smallest distance from the cluster to any outside sample.
        closest_outsider = member_rows[:, ~members].min(axis=0).min()
        # single data point clusters only have that inf diagonal here
        if members.sum() > 1:
            assert widest_inner_gap < distance_threshold
        assert closest_outsider >= distance_threshold
|
||||
|
||||
|
||||
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
@pytest.mark.parametrize(
    ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
)
def test_agglomerative_clustering_with_distance_threshold_edge_case(
    linkage, threshold, y_true
):
    """Boundary case: the threshold equals the only pairwise distance."""
    # Two 1-d samples exactly 1 apart.
    two_points = [[0], [1]]
    model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=threshold, linkage=linkage
    )
    predicted = model.fit_predict(two_points)
    assert adjusted_rand_score(y_true, predicted) == 1
|
||||
|
||||
|
||||
def test_dist_threshold_invalid_parameters():
    """Invalid `distance_threshold` combinations must raise ValueError."""
    X = [[0], [1]]
    # Setting neither or both of n_clusters / distance_threshold is rejected.
    for n_clusters, distance_threshold in ((None, None), (2, 1)):
        with pytest.raises(ValueError, match="Exactly one of "):
            AgglomerativeClustering(
                n_clusters=n_clusters, distance_threshold=distance_threshold
            ).fit(X)

    # A distance threshold cannot be combined with early stopping.
    with pytest.raises(ValueError, match="compute_full_tree must be True if"):
        AgglomerativeClustering(
            n_clusters=None, distance_threshold=1, compute_full_tree=False
        ).fit(X)
|
||||
|
||||
|
||||
def test_invalid_shape_precomputed_dist_matrix():
    """A non-square matrix with affinity='precomputed' must be rejected.

    See PR #16257.
    """
    rng = np.random.RandomState(0)
    non_square = rng.rand(5, 3)
    expected_error = r"Distance matrix should be square, got matrix of shape \(5, 3\)"
    model = AgglomerativeClustering(affinity="precomputed", linkage="complete")
    with pytest.raises(ValueError, match=expected_error):
        model.fit(non_square)
|
||||
|
||||
|
||||
def test_precomputed_connectivity_affinity_with_2_connected_components():
    """Precomputed affinity must work with a two-component connectivity.

    The labels and children obtained from a precomputed distance matrix are
    compared with those obtained from the raw data, using a connectivity
    matrix that has two connected components. Non-regression test for #16151.
    """
    # Edges 0-1, 0-2, 1-2 form one component and edge 3-4 the other.
    connectivity_matrix = np.zeros((5, 5), dtype=int)
    for row, col in ((0, 1), (0, 2), (1, 2), (3, 4)):
        connectivity_matrix[row, col] = 1
    # ensure that connectivity_matrix has two connected components
    assert connected_components(connectivity_matrix)[0] == 2

    rng = np.random.RandomState(0)
    X = rng.randn(5, 10)
    X_dist = pairwise_distances(X)

    msg = "Completing it to avoid stopping the tree early"

    clusterer_precomputed = AgglomerativeClustering(
        affinity="precomputed", connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=msg):
        clusterer_precomputed.fit(X_dist)

    clusterer = AgglomerativeClustering(
        connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=msg):
        clusterer.fit(X)

    for fitted_attribute in ("labels_", "children_"):
        assert_array_equal(
            getattr(clusterer, fitted_attribute),
            getattr(clusterer_precomputed, fitted_attribute),
        )
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Testing for mean shift clustering methods
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
import pytest
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
from sklearn.cluster import MeanShift
|
||||
from sklearn.cluster import mean_shift
|
||||
from sklearn.cluster import estimate_bandwidth
|
||||
from sklearn.cluster import get_bin_seeds
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics import v_measure_score
|
||||
|
||||
|
||||
# Shared fixture for the tests below: 300 samples drawn around three centers
# that are shifted away from the origin.
n_clusters = 3
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    n_samples=300,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=11,
)
|
||||
|
||||
|
||||
def test_estimate_bandwidth():
    """The estimated bandwidth must fall within the expected range."""
    estimated = estimate_bandwidth(X, n_samples=200)
    lower_bound, upper_bound = 0.9, 1.5
    assert lower_bound <= estimated <= upper_bound
|
||||
|
||||
|
||||
def test_estimate_bandwidth_1sample():
    """`estimate_bandwidth` with a single sample returns (almost) zero.

    With n_samples=1 and quantile<1, n_neighbors is set to 1.
    """
    estimated = estimate_bandwidth(X, n_samples=1, quantile=0.3)
    expected = pytest.approx(0.0, abs=1e-5)
    assert estimated == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bandwidth, cluster_all, expected, first_cluster_label",
    [(1.2, True, 3, 0), (1.2, False, 4, -1)],
)
def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label):
    """Check cluster count and smallest label for the class and the function."""
    # Estimator API, with an explicit bandwidth.
    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    estimator_labels = np.unique(estimator.fit(X).labels_)
    assert len(estimator_labels) == expected
    assert estimator_labels[0] == first_cluster_label

    # Function API, called without an explicit bandwidth.
    _, raw_function_labels = mean_shift(X, cluster_all=cluster_all)
    function_labels = np.unique(raw_function_labels)
    assert len(function_labels) == expected
    assert function_labels[0] == first_cluster_label
|
||||
|
||||
|
||||
def test_mean_shift_negative_bandwidth():
    """Fitting MeanShift with a negative bandwidth must raise ValueError."""
    estimator = MeanShift(bandwidth=-1)
    expected_error = r"bandwidth needs to be greater than zero or None, got -1\.000000"
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(X)
|
||||
|
||||
|
||||
def test_estimate_bandwidth_with_sparse_matrix():
    """`estimate_bandwidth` must reject sparse input with a TypeError."""
    sparse_X = sparse.lil_matrix((1000, 1000))
    expected_error = "A sparse matrix was passed, but dense data is required."
    with pytest.raises(TypeError, match=expected_error):
        estimate_bandwidth(sparse_X)
|
||||
|
||||
|
||||
def test_parallel():
    """MeanShift must give the same result with `n_jobs=2` as by default."""
    blob_centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    samples, _ = make_blobs(
        n_samples=50,
        n_features=2,
        centers=blob_centers,
        cluster_std=0.4,
        shuffle=True,
        random_state=11,
    )

    parallel_model = MeanShift(n_jobs=2)
    parallel_model.fit(samples)

    default_model = MeanShift()
    default_model.fit(samples)

    assert_array_almost_equal(
        parallel_model.cluster_centers_, default_model.cluster_centers_
    )
    assert_array_equal(parallel_model.labels_, default_model.labels_)
|
||||
|
||||
|
||||
def test_meanshift_predict():
    """`predict` on the training data must reproduce `fit_predict`."""
    estimator = MeanShift(bandwidth=1.2)
    fitted_labels = estimator.fit_predict(X)
    predicted_labels = estimator.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
|
||||
|
||||
|
||||
def test_meanshift_all_orphans():
    """Seeds far from all data must make `fit` fail with a clear message."""
    # init away from the data, crash with a sensible warning
    far_away_seeds = [[-9, -9], [-10, -10]]
    estimator = MeanShift(bandwidth=0.1, seeds=far_away_seeds)
    expected_error = "No point was within bandwidth=0.1"
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(X)
|
||||
|
||||
|
||||
def test_unfitted():
    """Non-regression: an unfitted MeanShift exposes no fitted attributes."""
    estimator = MeanShift()
    for fitted_attribute in ("cluster_centers_", "labels_"):
        assert not hasattr(estimator, fitted_attribute)
|
||||
|
||||
|
||||
def test_cluster_intensity_tie():
    """Swapping the two halves of the samples must swap the expected labels."""
    samples = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]])
    original_order = MeanShift(bandwidth=2).fit(samples)

    # Same points, with the two groups of three swapped.
    swapped_samples = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]])
    swapped_order = MeanShift(bandwidth=2).fit(swapped_samples)

    assert_array_equal(original_order.labels_, [1, 1, 1, 0, 0, 0])
    assert_array_equal(swapped_order.labels_, [0, 0, 0, 1, 1, 1])
|
||||
|
||||
|
||||
def test_bin_seeds():
    # Exercise get_bin_seeds, the binning technique that can be used to seed
    # the mean shift algorithm, on six points in the plane.
    X = np.array(
        [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]]
    )

    # Bin coarseness 1.0 in both cases:
    #   min_bin_freq=1 -> three bins should be found
    #   min_bin_freq=2 -> two bins should be found
    coarse_cases = (
        (1, {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)}),
        (2, {(1.0, 1.0), (2.0, 1.0)}),
    )
    for min_bin_freq, ground_truth in coarse_cases:
        found = {tuple(seed) for seed in get_bin_seeds(X, 1, min_bin_freq)}
        assert not ground_truth.symmetric_difference(found)

    # Bin size 0.01 with min_bin_freq=1: the seeds are expected to be the
    # data itself. Warnings raised by the call are recorded and discarded.
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)

    # Tight clusters around [0, 0] and [1, 1]: only two bins are expected.
    X, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=[[0, 0], [1, 1]],
        cluster_std=0.1,
        random_state=0,
    )
    assert_array_equal(get_bin_seeds(X, 1), [[0, 0], [1, 1]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_iter", [1, 100])
def test_max_iter(max_iter):
    # The mean_shift function and the MeanShift estimator must agree on the
    # cluster centers for a given max_iter, and n_iter_ must respect the cap.
    function_centers, _ = mean_shift(X, max_iter=max_iter)
    ms = MeanShift(max_iter=max_iter).fit(X)
    estimator_centers = ms.cluster_centers_

    assert ms.n_iter_ <= ms.max_iter
    assert len(function_centers) == len(estimator_centers)

    for from_function, from_estimator in zip(function_centers, estimator_centers):
        assert np.allclose(from_function, from_estimator)
|
||||
|
||||
|
||||
def test_mean_shift_zero_bandwidth():
    # Mean shift should still work when the estimated bandwidth is 0.
    X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1)

    # With default arguments estimate_bandwidth is expected to give 0 here.
    bandwidth = estimate_bandwidth(X)
    assert bandwidth == 0

    # A bin_size of 0 should make get_bin_seeds hand back the very same array.
    seeds = get_bin_seeds(X, bin_size=bandwidth)
    assert seeds is X

    # Binned seeding with bandwidth=None (hence an estimated bandwidth of 0)
    # should be equivalent to fitting without binning.
    ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
    ms_nobinning = MeanShift(bin_seeding=False).fit(X)
    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])

    for fitted in (ms_binning, ms_nobinning):
        assert v_measure_score(fitted.labels_, expected_labels) == 1
    assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_)
|
||||
@@ -0,0 +1,806 @@
|
||||
# Authors: Shane Grigsby <refuge@rocktalus.com>
|
||||
# Adrin Jalali <adrin.jalali@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
import numpy as np
|
||||
import pytest
|
||||
import warnings
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.cluster import OPTICS
|
||||
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
|
||||
from sklearn.exceptions import DataConversionWarning
|
||||
from sklearn.metrics.cluster import contingency_matrix
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
|
||||
|
||||
# Module-level fixture: six 2-D blobs of n_points_per_cluster points each,
# drawn in a fixed order from one seeded generator and stacked into X.
rng = np.random.RandomState(0)
n_points_per_cluster = 10
C1 = rng.randn(n_points_per_cluster, 2) * 0.8 + [-5, -2]
C2 = rng.randn(n_points_per_cluster, 2) * 0.1 + [4, -1]
C3 = rng.randn(n_points_per_cluster, 2) * 0.2 + [1, -2]
C4 = rng.randn(n_points_per_cluster, 2) * 0.3 + [-2, 3]
C5 = rng.randn(n_points_per_cluster, 2) * 1.6 + [3, -2]
C6 = rng.randn(n_points_per_cluster, 2) * 2 + [5, 6]
X = np.vstack([C1, C2, C3, C4, C5, C6])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[10, 8.9, 8.8, 8.7, 7, 10], 3],
        [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
        # This case used to be listed twice; one copy is enough.
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
    ],
)
def test_extend_downward(r_plot, end):
    # Check the index returned by _extend_region when it is given the
    # steep-downward and upward masks derived from a reachability plot.
    r_plot = np.array(r_plot)
    # Ratio of each reachability value to its successor.
    ratio = r_plot[:-1] / r_plot[1:]
    steep_downward = ratio >= 1 / 0.9
    upward = ratio < 1

    # Positional arguments 0 and 2 are presumably the start index and
    # min_samples -- confirm against _extend_region's signature.
    e = _extend_region(steep_downward, upward, 0, 2)
    assert e == end
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        ([1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6),
        ([1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0),
        ([1, 2, 2.1, 2, np.inf], 0),
        ([1, 2, 2.1, np.inf], 2),
    ],
)
def test_extend_upward(r_plot, end):
    # Check the index returned by _extend_region when it is given the
    # steep-upward and downward masks derived from a reachability plot.
    reachability = np.array(r_plot)
    # Ratio of each reachability value to its successor.
    ratio = reachability[:-1] / reachability[1:]
    steep_upward = ratio <= 0.9
    downward = ratio > 1

    assert _extend_region(steep_upward, downward, 0, 2) == end
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("ordering", "clusters", "expected"),
    [
        ([0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]),
        ([0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]),
        ([0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]),
        ([3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]),
    ],
)
def test_the_extract_xi_labels(ordering, clusters, expected):
    # _extract_xi_labels must map each (ordering, clusters) pair to the
    # expected per-sample labels.
    assert_array_equal(_extract_xi_labels(ordering, clusters), expected)
|
||||
|
||||
|
||||
def test_extract_xi(global_dtype):
    # Small and easy scenarios (no clusters around other clusters) that
    # include clearly isolated points at (100, 100), expected to be noise.
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    # (center, spread) of each blob; the blobs are drawn in this order from
    # the single generator above, so the order must not change.
    blob_specs = (
        ([-5, -2], 0.8),
        ([4, -1], 0.1),
        ([1, -2], 0.2),
        ([-2, 3], 0.3),
        ([3, -2], 0.6),
        ([5, 6], 0.2),
    )
    blobs = [
        center + spread * rng.randn(n_points_per_cluster, 2)
        for center, spread in blob_specs
    ]
    leading_blobs, last_blob = blobs[:5], blobs[5]

    # Scenario 1: a single isolated point.
    single_outlier = np.array([[100, 100]])
    X = np.vstack(leading_blobs + [single_outlier, last_blob]).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    shared_options = {"max_eps": 20, "cluster_method": "xi", "xi": 0.4}
    clust = OPTICS(min_samples=3, min_cluster_size=2, **shared_options).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # Same data with min_samples and min_cluster_size given as floats.
    clust = OPTICS(min_samples=0.1, min_cluster_size=0.08, **shared_options).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # Scenario 2: two isolated points.
    double_outlier = np.array([[100, 100]] * 2)
    X = np.vstack(leading_blobs + [double_outlier, last_blob]).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[
        [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
    ]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(
        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
    ).fit(X)
    # this may fail if the predecessor correction is not at work!
    assert_array_equal(clust.labels_, expected_labels)

    # Scenario 3: three hand-written groups of four points each.
    near_group = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
    middle_group = [[10, 10], [10, 9], [10, 11], [9, 10]]
    far_group = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((near_group, middle_group, far_group)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(
        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
    ).fit(X)
    assert_array_equal(clust.labels_, expected_labels)
|
||||
|
||||
|
||||
def test_cluster_hierarchy_(global_dtype):
    # Two blobs sharing the origin as center, one much wider than the other;
    # cluster_hierarchy_ is expected to hold two (start, end) rows close to
    # [0, 99] and [0, 199].
    rng = np.random.RandomState(0)
    n_points_per_cluster = 100

    narrow_draw = rng.randn(n_points_per_cluster, 2).astype(global_dtype, copy=False)
    C1 = [0, 0] + 2 * narrow_draw
    wide_draw = rng.randn(n_points_per_cluster, 2).astype(global_dtype, copy=False)
    C2 = [0, 0] + 50 * wide_draw

    X = shuffle(np.vstack((C1, C2)), random_state=0)

    clusters = OPTICS(min_samples=20, xi=0.1).fit(X).cluster_hierarchy_
    assert clusters.shape == (2, 2)

    # NOTE(review): the deviations are summed with their sign, so positive
    # and negative differences can cancel -- confirm this is intended.
    reference = np.array([[0, 99], [0, 199]])
    diff = np.sum(clusters - reference)
    assert diff / len(X) < 0.05
|
||||
|
||||
|
||||
def test_correct_number_of_clusters():
    # in 'auto' mode
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    # Parameters chosen specifically for this task.
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1)
    clust.fit(X)

    # Count the distinct labels, leaving out the noise label -1 if present.
    found_labels = set(clust.labels_)
    n_clusters_1 = len(found_labels) - int(-1 in clust.labels_)
    assert n_clusters_1 == n_clusters

    # Every per-sample attribute must be 1-D with one entry per sample and
    # carry the expected dtype kind ("i" integer, "f" float).
    n_samples = len(X)
    expected_kinds = (
        ("labels_", "i"),
        ("reachability_", "f"),
        ("core_distances_", "f"),
        ("ordering_", "i"),
    )
    for attribute, kind in expected_kinds:
        values = getattr(clust, attribute)
        assert values.shape == (n_samples,)
        assert values.dtype.kind == kind

    # ordering_ must contain every sample index.
    assert set(clust.ordering_) == set(range(n_samples))
|
||||
|
||||
|
||||
def test_minimum_number_of_sample_check():
    # Fitting a single sample while asking for min_samples=10 is expected to
    # be rejected with a ValueError.
    single_sample = [[1, 1]]
    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)

    with pytest.raises(ValueError, match="min_samples must be no greater than"):
        clust.fit(single_sample)
|
||||
|
||||
|
||||
def test_bad_extract():
    # The dbscan extraction eps (0.3) is larger than max_eps (5.0 * 0.03);
    # fit is expected to raise a ValueError naming both values.
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10)
    expected_error = "Specify an epsilon smaller than 0.15. Got 0.3."
    with pytest.raises(ValueError, match=expected_error):
        clust.fit(X)
|
||||
|
||||
|
||||
def test_bad_reachability():
    # With a very small max_eps (5.0 * 0.003) the fit is expected to warn
    # that all reachability values are inf.
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    expected_warning = "All reachability values are inf. Set a larger max_eps."
    with pytest.warns(UserWarning, match=expected_warning):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)
|
||||
|
||||
|
||||
def test_nowarn_if_metric_bool_data_bool():
    # make sure no warning is raised if metric and data are both boolean
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/18996

    pairwise_metric = "rogerstanimoto"
    # Draw the data from a locally seeded generator so the test input is
    # reproducible and does not depend on the global NumPy random state.
    rng = np.random.RandomState(0)
    X = rng.randint(2, size=(5, 2), dtype=bool)

    with warnings.catch_warnings():
        # Only DataConversionWarning is promoted to an error; other warning
        # categories keep their current filters.
        warnings.simplefilter("error", DataConversionWarning)

        OPTICS(metric=pairwise_metric).fit(X)
|
||||
|
||||
|
||||
def test_warn_if_metric_bool_data_no_bool():
    # make sure a *single* conversion warning is raised if metric is boolean
    # but data isn't
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/18996

    pairwise_metric = "rogerstanimoto"
    # Draw the data from a locally seeded generator so the test input is
    # reproducible and does not depend on the global NumPy random state.
    rng = np.random.RandomState(0)
    X = rng.randint(2, size=(5, 2), dtype=np.int32)
    msg = f"Data will be converted to boolean for metric {pairwise_metric}"

    with pytest.warns(DataConversionWarning, match=msg) as warn_record:
        OPTICS(metric=pairwise_metric).fit(X)
        # Exactly one warning must have been recorded during the fit.
        assert len(warn_record) == 1
|
||||
|
||||
|
||||
def test_nowarn_if_metric_no_bool():
    # make sure no conversion warning is raised if
    # metric isn't boolean, no matter what the data type is
    pairwise_metric = "minkowski"
    # Draw the data from a locally seeded generator so the test input is
    # reproducible and does not depend on the global NumPy random state.
    rng = np.random.RandomState(0)
    X_bool = rng.randint(2, size=(5, 2), dtype=bool)
    X_num = rng.randint(2, size=(5, 2), dtype=np.int32)

    with warnings.catch_warnings():
        # Only DataConversionWarning is promoted to an error.
        warnings.simplefilter("error", DataConversionWarning)

        # fit boolean data
        OPTICS(metric=pairwise_metric).fit(X_bool)
        # fit numeric data
        OPTICS(metric=pairwise_metric).fit(X_num)
|
||||
|
||||
|
||||
def test_close_extract():
    # Test extract where extraction eps is close to scaled max_eps
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    optics = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10)
    clust = optics.fit(X)

    # Labels start at 0, so a maximum label of 2 means three clusters.
    assert max(clust.labels_) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
def test_dbscan_optics_parity(eps, min_samples, global_dtype):
    # OPTICS with dbscan extraction must differ from DBSCAN on no more than
    # 5% of the labels (after the rounding applied below).
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )
    X = X.astype(global_dtype, copy=False)
    n_samples = X.shape[0]

    # OPTICS with dbscan-style extraction at the parametrized eps
    op = OPTICS(min_samples=min_samples, cluster_method="dbscan", eps=eps).fit(X)

    # plain DBSCAN with the same eps and min_samples
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    # Agreement is the smaller of the two sums of per-column and per-row
    # maxima of the contingency table between both labelings.
    contingency = contingency_matrix(db.labels_, op.labels_)
    column_best = np.sum(np.max(contingency, axis=0))
    row_best = np.sum(np.max(contingency, axis=1))
    agree = min(column_best, row_best)
    disagree = n_samples - agree

    percent_mismatch = np.round((disagree - 1) / n_samples, 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
|
||||
|
||||
|
||||
def test_min_samples_edge_case(global_dtype):
    # Three groups of exactly three points each, fitted with min_samples at
    # and just above the group size and with different max_eps values.
    groups = (
        [[0, 0], [0, 0.1], [0, -0.1]],
        [[10, 10], [10, 9], [10, 11]],
        [[100, 100], [100, 96], [100, 106]],
    )
    X = np.vstack(groups).astype(global_dtype, copy=False)

    # max_eps=7: all three groups are expected to form clusters.
    expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # max_eps=3: the last group is expected to be labelled as noise.
    expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # min_samples=4: everything is expected to be noise, with a warning.
    expected_labels = np.r_[[-1] * 9]
    with pytest.warns(UserWarning, match="All reachability values"):
        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
        assert_array_equal(clust.labels_, expected_labels)
|
||||
|
||||
|
||||
# try arbitrary minimum sizes
@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size, global_dtype):
    # Every extracted cluster must have at least min_cluster_size members,
    # and an equivalent fractional min_cluster_size must give equal labels.
    redX = X[::2].astype(global_dtype, copy=False)  # reduce for speed
    n_reduced = redX.shape[0]

    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
    non_noise_labels = clust.labels_[clust.labels_ != -1]
    cluster_sizes = np.bincount(non_noise_labels)
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size

    # check behaviour is the same when min_cluster_size is a fraction
    fraction = min_cluster_size / n_reduced
    clust_frac = OPTICS(min_samples=9, min_cluster_size=fraction)
    clust_frac.fit(redX)
    assert_array_equal(clust.labels_, clust_frac.labels_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_cluster_size", [0, -1, 1.1, 2.2])
def test_min_cluster_size_invalid(min_cluster_size):
    # Each of the parametrized min_cluster_size values is expected to be
    # rejected at fit time with a ValueError.
    expected_error = "must be a positive integer or a "
    clust = OPTICS(min_cluster_size=min_cluster_size)
    with pytest.raises(ValueError, match=expected_error):
        clust.fit(X)
|
||||
|
||||
|
||||
def test_min_cluster_size_invalid2():
    # A min_cluster_size one larger than the number of samples is expected
    # to be rejected at fit time with a ValueError.
    too_large = len(X) + 1
    clust = OPTICS(min_cluster_size=too_large)
    with pytest.raises(ValueError, match="must be no greater than the "):
        clust.fit(X)
|
||||
|
||||
|
||||
def test_processing_order():
    # Ensure that all unprocessed points are considered when picking the
    # next point, not only the direct neighbors.
    Y = [[0], [10], [-10], [25]]
    clust = OPTICS(min_samples=3, max_eps=15).fit(Y)

    expected_attributes = {
        "reachability_": [np.inf, 10, 10, 15],
        "core_distances_": [10, 15, np.inf, np.inf],
        "ordering_": [0, 1, 2, 3],
    }
    for attribute, expected in expected_attributes.items():
        assert_array_equal(getattr(clust, attribute), expected)
|
||||
|
||||
|
||||
def test_compare_to_ELKI():
    # Reference values below were computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    #
    # r* are reachabilities, o* orderings and p* predecessors, all listed in
    # cluster order (they are compared against attributes indexed by
    # ordering_ below).
    # fmt: off
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
        0.7290174038973836, 0.7290174038973836, 0.6861627576116127, 0.7587934993548423,
        0.9280118450166668, 1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
        0.2552805046961355, 0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
        0.24944622248445714, 0.2552805046961355, 0.2552805046961355, 0.3086779122185853,
        4.163024452756142, 1.623152630340929, 0.45315840475822655, 0.25468325192031926,
        0.2254004358159971, 0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
        0.18765711877083036, 0.18765711877083036, 0.2240202988740153, 1.154337614548715,
        1.342604473837069, 1.323308536402633, 0.8607514948648837, 0.27219111215810565,
        0.13260875220533205, 0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
        0.13548790801634494, 0.1575483940837384, 0.17515137170530226, 0.17575920159442388,
        0.27219111215810565, 0.6101447895405373, 1.3189208094864302, 1.323308536402633,
        2.2509184159764577, 2.4517810628594527, 3.675977064404973, 3.8264795626020365,
        2.9130735341510614, 2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
        2.8459300127258036, 2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1,
        31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28,
        20, 40, 45, 46, 10, 15, 11, 13, 17, 19,
        18, 12, 16, 14, 47, 49, 43, 48, 42, 41,
        53, 57, 51, 52, 56, 59, 54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5,
        1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22,
        22, 21, 40, 45, 46, 10, 15, 15, 13, 13,
        15, 11, 19, 15, 10, 47, 12, 45, 14, 43,
        42, 53, 57, 57, 57, 57, 59, 59, 59, 58,
    ]
    # fmt: on

    # Tests against known extraction array
    # Original note: does NOT work with metric='euclidean', because sklearn
    # euclidean has worse numeric precision. 'minkowski' is slower but more
    # accurate.
    clust1 = OPTICS(min_samples=5).fit(X)
    order1 = clust1.ordering_

    assert_array_equal(order1, np.array(o1))
    assert_array_equal(clust1.predecessor_[order1], np.array(p1))
    assert_allclose(clust1.reachability_[order1], np.array(r1))

    # ELKI currently does not print the core distances (which are not used
    # much in literature), but this consistency can at least be ensured:
    # each point's reachability is no smaller than its predecessor's core
    # distance.
    for sample in clust1.ordering_[1:]:
        predecessor = clust1.predecessor_[sample]
        assert clust1.reachability_[sample] >= clust1.core_distances_[predecessor]

    # Expected values for max_eps=0.5, computed with (future) ELKI 0.7.5
    # fmt: off
    r2 = (
        [np.inf] * 11
        + [
            0.27219111215810565, 0.13260875220533205, 0.13260875220533205,
            0.09890587675958984, 0.09890587675958984, 0.13548790801634494,
            0.1575483940837384, 0.17515137170530226, 0.17575920159442388,
            0.27219111215810565, 0.4928068613197889,
        ]
        + [np.inf]
        + [
            0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
            0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
            0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
        ]
        + [np.inf]
        + [
            0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
            0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
            0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        ]
        + [np.inf] * 18
    )
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26,
        28, 21, 30, 32, 34, 33, 38, 39, 35, 37,
        36, 31, 40, 41, 42, 43, 44, 45, 48, 49,
        50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    ]
    p2 = (
        [-1] * 11
        + [10, 15, 15, 13, 13, 15, 11, 19, 15, 10, 47]
        + [-1]
        + [20, 22, 25, 25, 25, 25, 22, 22, 23]
        + [-1]
        + [30, 30, 34, 34, 34, 32, 32, 37, 38]
        + [-1] * 18
    )
    # fmt: on

    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
    order2 = clust2.ordering_

    assert_array_equal(order2, np.array(o2))
    assert_array_equal(clust2.predecessor_[order2], np.array(p2))
    assert_allclose(clust2.reachability_[order2], np.array(r2))

    # Core distances of at most 0.5 in the first fit must be identical in
    # the fit limited to max_eps=0.5.
    within_eps = clust1.core_distances_ <= 0.5
    assert_allclose(
        clust1.core_distances_[within_eps], clust2.core_distances_[within_eps]
    )
|
||||
|
||||
|
||||
def test_wrong_cluster_method():
    # An unknown cluster_method is expected to be rejected at fit time.
    expected_error = "cluster_method should be one of "
    clust = OPTICS(cluster_method="superfancy")
    with pytest.raises(ValueError, match=expected_error):
        clust.fit(X)
|
||||
|
||||
|
||||
def test_extract_dbscan(global_dtype):
    # An easy dbscan case: four blobs drawn with the same spread, so no
    # clusters of different densities are involved.
    rng = np.random.RandomState(0)
    n_points_per_cluster = 20

    # The blobs are drawn in this order from the single generator above.
    blob_centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])
    blobs = [
        center + 0.2 * rng.randn(n_points_per_cluster, 2) for center in blob_centers
    ]
    X = np.vstack(blobs).astype(global_dtype, copy=False)

    clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X)
    found_labels = np.sort(np.unique(clust.labels_))
    assert_array_equal(found_labels, [0, 1, 2, 3])
|
||||
|
||||
|
||||
def test_precomputed_dists(global_dtype):
    # Fitting on a precomputed euclidean distance matrix must give the same
    # reachabilities and labels as fitting on the points themselves.
    redX = X[::2].astype(global_dtype, copy=False)
    dists = pairwise_distances(redX, metric="euclidean")

    shared_options = {"min_samples": 10, "algorithm": "brute"}
    from_distances = OPTICS(metric="precomputed", **shared_options).fit(dists)
    from_points = OPTICS(metric="euclidean", **shared_options).fit(redX)

    assert_allclose(from_distances.reachability_, from_points.reachability_)
    assert_array_equal(from_distances.labels_, from_points.labels_)
|
||||
@@ -0,0 +1,414 @@
|
||||
"""Testing for Spectral Clustering methods"""
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.linalg import LinAlgError
|
||||
|
||||
import pytest
|
||||
|
||||
import pickle
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
from sklearn.cluster import SpectralClustering, spectral_clustering
|
||||
from sklearn.cluster._spectral import discretize, cluster_qr
|
||||
from sklearn.feature_extraction import img_to_graph
|
||||
from sklearn.metrics import pairwise_distances
|
||||
from sklearn.metrics import adjusted_rand_score
|
||||
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.datasets import make_blobs
|
||||
|
||||
try:
|
||||
from pyamg import smoothed_aggregation_solver # noqa
|
||||
|
||||
amg_loaded = True
|
||||
except ImportError:
|
||||
amg_loaded = False
|
||||
|
||||
# Module-level fixture: 60 two-dimensional samples around three centers that
# are shifted by 10 on both axes.
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    n_samples=60,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering(eigen_solver, assign_labels):
    # Precomputed 7x7 affinity: samples 0-2 form one group and samples 3-6
    # another, with sample 3 weakly (0.2) tied to the first group.
    first_group_row = [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0]
    bridge_row = [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0]
    second_group_row = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]
    S = np.array([first_group_row] * 3 + [bridge_row] + [second_group_row] * 3)

    # The same affinity is fitted as a dense array and as a CSR matrix.
    for mat in (S, sparse.csr_matrix(S)):
        estimator = SpectralClustering(
            random_state=0,
            n_clusters=2,
            affinity="precomputed",
            eigen_solver=eigen_solver,
            assign_labels=assign_labels,
        )
        model = estimator.fit(mat)

        # Flip the two labels when needed so that sample 0 carries label 1.
        labels = model.labels_
        if labels[0] == 0:
            labels = 1 - labels

        assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1

        # The fitted model must survive a pickle round trip unchanged.
        model_copy = pickle.loads(pickle.dumps(model))
        assert model_copy.n_clusters == model.n_clusters
        assert model_copy.eigen_solver == model.eigen_solver
        assert_array_equal(model_copy.labels_, model.labels_)
|
||||
|
||||
|
||||
def test_spectral_unknown_mode():
    # spectral_clustering is expected to raise a ValueError when it is given
    # an unknown eigen_solver.
    blob_centers = np.array(
        [[0.0, 0.0, 0.0], [10.0, 10.0, 10.0], [20.0, 20.0, 20.0]]
    )
    X, _ = make_blobs(
        n_samples=100, centers=blob_centers, cluster_std=1.0, random_state=42
    )
    distances = pairwise_distances(X)
    # Turn distances into similarities by subtracting them from the maximum.
    similarities = sparse.coo_matrix(np.max(distances) - distances)

    with pytest.raises(ValueError):
        spectral_clustering(
            similarities, n_clusters=2, random_state=0, eigen_solver="<unknown>"
        )
|
||||
|
||||
|
||||
def test_spectral_unknown_assign_labels():
    # spectral_clustering is expected to raise a ValueError when it is given
    # an unknown assign_labels strategy.
    blob_centers = np.array(
        [[0.0, 0.0, 0.0], [10.0, 10.0, 10.0], [20.0, 20.0, 20.0]]
    )
    X, _ = make_blobs(
        n_samples=100, centers=blob_centers, cluster_std=1.0, random_state=42
    )
    distances = pairwise_distances(X)
    # Turn distances into similarities by subtracting them from the maximum.
    similarities = sparse.coo_matrix(np.max(distances) - distances)

    with pytest.raises(ValueError):
        spectral_clustering(
            similarities, n_clusters=2, random_state=0, assign_labels="<unknown>"
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input, params, err_type, err_msg",
    [
        # n_clusters
        (X, {"n_clusters": -1}, ValueError, "n_clusters == -1, must be >= 1"),
        (X, {"n_clusters": 0}, ValueError, "n_clusters == 0, must be >= 1"),
        (
            X,
            {"n_clusters": 1.5},
            TypeError,
            "n_clusters must be an instance of int, not float",
        ),
        # n_init
        (X, {"n_init": -1}, ValueError, "n_init == -1, must be >= 1"),
        (X, {"n_init": 0}, ValueError, "n_init == 0, must be >= 1"),
        (
            X,
            {"n_init": 1.5},
            TypeError,
            "n_init must be an instance of int, not float",
        ),
        # gamma
        (X, {"gamma": -1}, ValueError, "gamma == -1, must be >= 1"),
        (X, {"gamma": 0}, ValueError, "gamma == 0, must be >= 1"),
        # n_neighbors
        (X, {"n_neighbors": -1}, ValueError, "n_neighbors == -1, must be >= 1"),
        (X, {"n_neighbors": 0}, ValueError, "n_neighbors == 0, must be >= 1"),
        # eigen_tol
        (
            X,
            {"eigen_tol": -1, "eigen_solver": "arpack"},
            ValueError,
            "eigen_tol == -1, must be >= 0",
        ),
        # degree
        (X, {"degree": -1}, ValueError, "degree == -1, must be >= 1"),
        (X, {"degree": 0}, ValueError, "degree == 0, must be >= 1"),
    ],
)
def test_spectral_params_validation(input, params, err_type, err_msg):
    """Check that `SpectralClustering.fit` rejects invalid parameter values.

    Each case builds the estimator from ``params`` and expects ``fit`` on
    ``input`` to raise ``err_type`` with a message matching ``err_msg``.
    """
    estimator = SpectralClustering(**params)
    with pytest.raises(err_type, match=err_msg):
        estimator.fit(input)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels):
    # Two very tight blobs; the thresholded RBF affinity is passed as a
    # sparse COO matrix and the clustering must recover the blobs exactly.
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Subtract a small constant and clip at zero before converting to COO.
    affinity = np.maximum(rbf_kernel(X, gamma=1) - 1e-4, 0)
    S = sparse.coo_matrix(affinity)

    model = SpectralClustering(
        random_state=0,
        n_clusters=2,
        affinity="precomputed",
        assign_labels=assign_labels,
    )
    labels = model.fit(S).labels_
    assert adjusted_rand_score(y, labels) == 1
|
||||
|
||||
|
||||
def test_precomputed_nearest_neighbors_filtering():
    """Labels must not change when the precomputed graph has extra neighbors."""
    X, y = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    n_neighbors = 2

    def labels_for(additional_neighbors):
        # Build a connectivity graph holding `additional_neighbors` more
        # neighbors than the estimator is configured to use.
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors)
        graph = nn.fit(X).kneighbors_graph(X, mode="connectivity")
        model = SpectralClustering(
            random_state=0,
            n_clusters=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        return model.fit(graph).labels_

    results = [labels_for(extra) for extra in [0, 10]]

    assert_array_equal(results[0], results[1])
|
||||
|
||||
|
||||
def test_affinities():
    """Run `SpectralClustering` through the different kinds of `affinity`."""
    # Note: random_state was chosen so that the dataset yields a stable
    # eigen decomposition on both OSX and Linux.
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # nearest neighbors affinity: a connectivity warning is expected
    knn_model = SpectralClustering(
        n_clusters=2, affinity="nearest_neighbors", random_state=0
    )
    with pytest.warns(UserWarning, match="not fully connected"):
        knn_model.fit(X)
    assert adjusted_rand_score(y, knn_model.labels_) == 1

    # affinity left unspecified, custom gamma
    gamma_model = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    assert adjusted_rand_score(y, gamma_model.fit(X).labels_) == 1

    # From here on only the shape of the resulting labels is checked.
    X = check_random_state(10).rand(10, 5) * 10

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for kern in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kern == "additive_chi2":
            continue
        model = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
        assert (X.shape[0],) == model.fit(X).labels_.shape

    # callable affinities: a constant kernel, then the histogram kernel
    for kernel_func in (lambda x, y: 1, histogram):
        model = SpectralClustering(
            n_clusters=2, affinity=kernel_func, random_state=0
        )
        assert (X.shape[0],) == model.fit(X).labels_.shape

    # raise error on unknown affinity
    unknown_model = SpectralClustering(n_clusters=2, affinity="<unknown>")
    with pytest.raises(ValueError):
        unknown_model.fit(X)
|
||||
|
||||
|
||||
def test_cluster_qr():
    """Check `cluster_qr` labels are complete and identical across dtypes.

    cluster_qr by itself should not be used for clustering generic data
    other than the rows of the eigenvectors within spectral clustering;
    the labels computed here may be meaningless, but they must still be
    the same for float64 and float32 versions of the same fixed input.
    """
    rng = np.random.RandomState(seed=8)
    n_samples, n_components = 10, 5
    data = rng.randn(n_samples, n_components)

    double_labels = cluster_qr(data.astype(np.float64))
    # one cluster identifier per sample
    assert double_labels.shape == (n_samples,)
    # every component index shows up among the assigned labels
    assert np.array_equal(np.unique(double_labels), np.arange(n_components))

    # single precision input must lead to the very same assignment
    single_labels = cluster_qr(data.astype(np.float32))
    assert np.array_equal(double_labels, single_labels)
|
||||
|
||||
|
||||
def test_cluster_qr_permutation_invariance():
    """Check that permuting the samples permutes `cluster_qr` labels alike."""
    rng = np.random.RandomState(seed=8)
    n_samples, n_components = 100, 5
    data = rng.randn(n_samples, n_components)
    perm = rng.permutation(n_samples)

    # Labelling then permuting must agree with permuting then labelling.
    labels_then_permuted = cluster_qr(data)[perm]
    labels_of_permuted = cluster_qr(data[perm])
    assert np.array_equal(labels_then_permuted, labels_of_permuted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples):
    """Check `discretize` recovers labels from a noisy assignment matrix."""
    rng = np.random.RandomState(seed=8)
    sample_index = np.arange(n_samples)
    for n_class in range(2, 10):
        n_labels = n_class + 1
        # random class labels, stored as floats
        y_true = np.array(rng.randint(0, n_labels, n_samples), float)
        # one-hot class assignment matrix built in sparse COO form
        y_indicator = sparse.coo_matrix(
            (np.ones(n_samples), (sample_index, y_true)),
            shape=(n_samples, n_labels),
        )
        # perturb the indicator matrix with Gaussian noise scaled by 0.1
        noise = 0.1 * rng.randn(n_samples, n_labels)
        y_true_noisy = y_indicator.toarray() + noise
        y_pred = discretize(y_true_noisy, random_state=rng)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
|
||||
|
||||
|
||||
# TODO: Remove when pyamg replaces its sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg solvers give the same clustering.

    Based on the toy example from plot_segmentation_toy.py. When amg is not
    loaded, requesting the "amg" solver must raise a ValueError instead.
    """
    # a small two coin image
    rows, cols = np.indices((40, 40))

    def disk(center, radius):
        # Boolean mask of the pixels strictly inside the given circle.
        return (rows - center[0]) ** 2 + (cols - center[1]) ** 2 < radius**2

    circles = disk((14, 12), 8) | disk((20, 25), 7)
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver="arpack", random_state=0
    )
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        with pytest.raises(ValueError):
            spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)
    else:
        labels_amg = spectral_clustering(
            graph, n_clusters=2, eigen_solver="amg", random_state=0
        )
        assert adjusted_rand_score(labels_arpack, labels_amg) == 1
|
||||
|
||||
|
||||
def test_n_components():
    """Check that `n_components` follows `n_clusters` by default and matters.

    Setting `n_components` equal to `n_clusters` must reproduce the default
    labels, while fixing `n_components` and leaving `n_clusters` unset must
    produce a different labelling.
    """
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    def fit_labels(**params):
        # Fit a fresh estimator with a fixed seed and return its labels.
        return SpectralClustering(random_state=0, **params).fit(X).labels_

    labels = fit_labels(n_clusters=2)

    # n_components explicitly equal to n_clusters: same result as the default
    labels_same_ncomp = fit_labels(n_clusters=2, n_components=2)
    assert_array_equal(labels, labels_same_ncomp)

    # n_components does affect the result:
    # n_clusters=8 by default, and n_components is set to 2
    labels_diff_ncomp = fit_labels(n_components=2)
    assert not np.array_equal(labels, labels_diff_ncomp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_verbose(assign_labels, capsys):
    """Check the verbose output of `SpectralClustering` for each strategy.

    Parameters
    ----------
    assign_labels : str
        Label assignment strategy under test, supplied by the parametrization.
    capsys : pytest fixture
        Used to capture what the estimator prints to stdout while fitting.
    """
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Forward the parametrized strategy to the estimator; without it every
    # case would silently run with the estimator's default `assign_labels`.
    SpectralClustering(
        n_clusters=2, random_state=42, verbose=1, assign_labels=assign_labels
    ).fit(X)

    captured = capsys.readouterr()

    # This message is expected whatever the strategy is.
    assert re.search(r"Computing label assignment using", captured.out)

    # The KMeans progress messages are only expected for the kmeans strategy.
    if assign_labels == "kmeans":
        assert re.search(r"Initialization complete", captured.out)
        assert re.search(r"Iteration [0-9]+, inertia", captured.out)
|
||||
|
||||
|
||||
def test_spectral_clustering_np_matrix_raises():
    """Check that `spectral_clustering` rejects an `np.matrix` affinity with
    an informative TypeError. See #10993"""
    affinity = np.matrix([[0.0, 2.0], [2.0, 0.0]])

    with pytest.raises(
        TypeError,
        match=(
            r"spectral_clustering does not support passing in affinity as an"
            r" np\.matrix"
        ),
    ):
        spectral_clustering(affinity)
|
||||
|
||||
|
||||
def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """Check that discretize raises LinAlgError when svd never converges.

    `np.linalg.svd` is monkeypatched with a stub that always raises, and
    `discretize` is then expected to fail with "SVD did not converge".

    Non-regression test for #21380
    """

    def always_failing_svd(*args, **kwargs):
        # Stand-in for an SVD routine that can never succeed.
        raise LinAlgError()

    monkeypatch.setattr(np.linalg, "svd", always_failing_svd)

    vectors = np.ones((10, 4))
    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(vectors)
|
||||
Reference in New Issue
Block a user