first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
"""
The :mod:`sklearn.manifold` module implements data embedding techniques.
"""
from ._locally_linear import locally_linear_embedding, LocallyLinearEmbedding
from ._isomap import Isomap
from ._mds import MDS, smacof
from ._spectral_embedding import SpectralEmbedding, spectral_embedding
from ._t_sne import TSNE, trustworthiness
# Public names exported by this package; each entry is one of the names
# imported from the private submodules above.
__all__ = [
"locally_linear_embedding",
"LocallyLinearEmbedding",
"Isomap",
"MDS",
"smacof",
"SpectralEmbedding",
"spectral_embedding",
"TSNE",
"trustworthiness",
]

View File

@@ -0,0 +1,394 @@
"""Isomap for manifold learning"""
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) 2011
import warnings
import numpy as np
from scipy.sparse import issparse
from scipy.sparse.csgraph import shortest_path
from scipy.sparse.csgraph import connected_components
from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..neighbors import radius_neighbors_graph
from ..utils.validation import check_is_fitted
from ..decomposition import KernelPCA
from ..preprocessing import KernelCenterer
from ..utils.graph import _fix_connected_components
class Isomap(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Isomap Embedding.

    Non-linear dimensionality reduction through Isometric Mapping

    Read more in the :ref:`User Guide <isomap>`.

    Parameters
    ----------
    n_neighbors : int or None, default=5
        Number of neighbors to consider for each point. If `n_neighbors` is an int,
        then `radius` must be `None`.

    radius : float or None, default=None
        Limiting distance of neighbors to return. If `radius` is a float,
        then `n_neighbors` must be set to `None`.

        .. versionadded:: 1.1

    n_components : int, default=2
        Number of coordinates for the manifold.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        'auto' : Attempt to choose the most efficient solver
        for the given problem.

        'arpack' : Use Arnoldi decomposition to find the eigenvalues
        and eigenvectors.

        'dense' : Use a direct solver (i.e. LAPACK)
        for the eigenvalue decomposition.

    tol : float, default=0
        Convergence tolerance passed to arpack or lobpcg.
        not used if eigen_solver == 'dense'.

    max_iter : int, default=None
        Maximum number of iterations for the arpack solver.
        not used if eigen_solver == 'dense'.

    path_method : {'auto', 'FW', 'D'}, default='auto'
        Method to use in finding shortest path.

        'auto' : attempt to choose the best algorithm automatically.

        'FW' : Floyd-Warshall algorithm.

        'D' : Dijkstra's algorithm.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
                          default='auto'
        Algorithm to use for nearest neighbors search,
        passed to neighbors.NearestNeighbors instance.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    metric : str, or callable, default="minkowski"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`Glossary <sparse graph>`.

        .. versionadded:: 0.22

    p : int, default=2
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        .. versionadded:: 0.22

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.22

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the embedding vectors.

    kernel_pca_ : object
        :class:`~sklearn.decomposition.KernelPCA` object used to implement the
        embedding.

    nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDtree
        if applicable.

    dist_matrix_ : array-like, shape (n_samples, n_samples)
        Stores the geodesic distance matrix of training data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    MDS : Manifold learning using multidimensional scaling.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------

    .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
           framework for nonlinear dimensionality reduction. Science 290 (5500)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import Isomap
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = Isomap(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        *,
        n_neighbors=5,
        radius=None,
        n_components=2,
        eigen_solver="auto",
        tol=0,
        max_iter=None,
        path_method="auto",
        neighbors_algorithm="auto",
        n_jobs=None,
        metric="minkowski",
        p=2,
        metric_params=None,
    ):
        self.n_neighbors = n_neighbors
        self.radius = radius
        self.n_components = n_components
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.path_method = path_method
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs
        self.metric = metric
        self.p = p
        self.metric_params = metric_params

    def _fit_transform(self, X):
        """Fit the neighbors graph, geodesic distances and kernel PCA on X.

        Sets ``nbrs_``, ``n_features_in_``, ``feature_names_in_`` (when the
        fitted neighbors estimator exposes it), ``kernel_pca_``,
        ``dist_matrix_``, ``embedding_`` and ``_n_features_out``.

        Raises
        ------
        ValueError
            If both `n_neighbors` and `radius` are set.
        RuntimeError
            If the neighbors graph is disconnected while
            ``metric == "precomputed"`` and `X` is sparse.
        """
        # Exactly one of the two neighborhood definitions may be active.
        if self.n_neighbors is not None and self.radius is not None:
            raise ValueError(
                "Both n_neighbors and radius are provided. Use"
                f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
                " radius-based neighbors"
            )

        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            radius=self.radius,
            algorithm=self.neighbors_algorithm,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        self.nbrs_.fit(X)
        # Mirror the input-feature metadata recorded by the neighbors estimator.
        self.n_features_in_ = self.nbrs_.n_features_in_
        if hasattr(self.nbrs_, "feature_names_in_"):
            self.feature_names_in_ = self.nbrs_.feature_names_in_

        self.kernel_pca_ = KernelPCA(
            n_components=self.n_components,
            kernel="precomputed",
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            n_jobs=self.n_jobs,
        )

        if self.n_neighbors is not None:
            nbg = kneighbors_graph(
                self.nbrs_,
                self.n_neighbors,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )
        else:
            nbg = radius_neighbors_graph(
                self.nbrs_,
                radius=self.radius,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )

        # Compute the number of connected components, and connect the different
        # components to be able to compute a shortest path between all pairs
        # of samples in the graph.
        # Similar fix to cluster._agglomerative._fix_connectivity.
        n_connected_components, labels = connected_components(nbg)

        if n_connected_components > 1:
            if self.metric == "precomputed" and issparse(X):
                raise RuntimeError(
                    "The number of connected components of the neighbors graph"
                    f" is {n_connected_components} > 1. The graph cannot be "
                    "completed with metric='precomputed', and Isomap cannot be "
                    "fitted. Increase the number of neighbors to avoid this "
                    "issue, or precompute the full distance matrix instead "
                    "of passing a sparse neighbors graph."
                )
            warnings.warn(
                "The number of connected components of the neighbors graph "
                f"is {n_connected_components} > 1. Completing the graph to fit"
                " Isomap might be slow. Increase the number of neighbors to "
                "avoid this issue.",
                stacklevel=2,
            )

            # use array validated by NearestNeighbors
            nbg = _fix_connected_components(
                X=self.nbrs_._fit_X,
                graph=nbg,
                n_connected_components=n_connected_components,
                component_labels=labels,
                mode="distance",
                metric=self.nbrs_.effective_metric_,
                **self.nbrs_.effective_metric_params_,
            )

        self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)

        # Kernel fed to the precomputed KernelPCA: -0.5 * D**2 (built in place
        # on a fresh array so that dist_matrix_ itself is left untouched).
        G = self.dist_matrix_**2
        G *= -0.5

        self.embedding_ = self.kernel_pca_.fit_transform(G)
        self._n_features_out = self.embedding_.shape[1]

    def reconstruction_error(self):
        """Compute the reconstruction error for the embedding.

        Returns
        -------
        reconstruction_error : float
            Reconstruction error.

        Notes
        -----
        The cost function of an isomap embedding is

        ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``

        Where D is the matrix of distances for the input data X,
        D_fit is the matrix of distances for the output embedding X_fit,
        and K is the isomap kernel:

        ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
        """
        G = -0.5 * self.dist_matrix_**2
        G_center = KernelCenterer().fit_transform(G)
        evals = self.kernel_pca_.eigenvalues_
        return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]

    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
            Sample data, shape = (n_samples, n_features), in the form of a
            numpy array, sparse graph, precomputed tree, or NearestNeighbors
            object.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        self._fit_transform(X)
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse graph, BallTree, KDTree}
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            X transformed in the new space.
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """Transform X.

        This is implemented by linking the points X into the graph of geodesic
        distances of the training data. First the neighbors of X (the
        `n_neighbors` nearest ones, or those within `radius` when
        `n_neighbors` is None) are found in the training data, and from these
        the shortest geodesic distances from each point in X to each point in
        the training data are computed in order to construct the kernel.
        The embedding of X is the projection of this kernel onto the
        embedding vectors of the training set.

        Parameters
        ----------
        X : array-like, shape (n_queries, n_features)
            If metric='precomputed', X is assumed to be a
            distance matrix or a sparse graph of shape
            (n_queries, n_samples_fit).

        Returns
        -------
        X_new : array-like, shape (n_queries, n_components)
            X transformed in the new space.
        """
        check_is_fitted(self)
        if self.n_neighbors is not None:
            distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
        else:
            distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)

        # Create the graph of shortest distances from X to
        # training data via the nearest neighbors of X.
        # This can be done as a single array operation, but it potentially
        # takes a lot of memory. To avoid that, use a loop:

        n_samples_fit = self.nbrs_.n_samples_fit_
        n_queries = distances.shape[0]
        G_X = np.zeros((n_queries, n_samples_fit))
        for i in range(n_queries):
            # For query i: min over its neighbors j of
            # (distance to j + geodesic distance from j to every training point).
            G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)

        # Same kernel transform as used at fit time: -0.5 * D**2.
        G_X **= 2
        G_X *= -0.5

        return self.kernel_pca_.transform(G_X)

View File

@@ -0,0 +1,810 @@
"""Locally Linear Embedding"""
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) INRIA 2011
import numpy as np
from scipy.linalg import eigh, svd, qr, solve
from scipy.sparse import eye, csr_matrix
from scipy.sparse.linalg import eigsh
from ..base import (
BaseEstimator,
TransformerMixin,
_UnstableArchMixin,
_ClassNamePrefixFeaturesOutMixin,
)
from ..utils import check_random_state, check_array
from ..utils._arpack import _init_arpack_v0
from ..utils.extmath import stable_cumsum
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..neighbors import NearestNeighbors
def barycenter_weights(X, Y, indices, reg=1e-3):
    """Compute barycenter weights of X from Y along the first axis.

    We estimate the weights to assign to each point in Y[indices] to recover
    the point X[i]. The barycenter weights sum to 1.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_dim)

    Y : array-like, shape (n_samples, n_dim)

    indices : array-like, shape (n_samples, n_neighbors)
        Indices of the points in Y used to compute the barycenter.

    reg : float, default=1e-3
        Amount of regularization to add for the problem to be
        well-posed in the case of n_neighbors > n_dim.

    Returns
    -------
    B : array-like, shape (n_samples, n_neighbors)

    Notes
    -----
    See developers note for more information.
    """
    X = check_array(X, dtype=FLOAT_DTYPES)
    Y = check_array(Y, dtype=FLOAT_DTYPES)
    indices = check_array(indices, dtype=int)

    n_samples, n_neighbors = indices.shape
    assert X.shape[0] == n_samples

    B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
    v = np.ones(n_neighbors, dtype=X.dtype)

    # this might raise a LinalgError if G is singular and has trace
    # zero
    for i, ind in enumerate(indices):
        A = Y[ind]
        C = A - X[i]  # broadcasting
        G = np.dot(C, C.T)
        trace = np.trace(G)
        # Scale the regularization by the trace when it is positive;
        # otherwise fall back to the raw `reg` value.
        if trace > 0:
            R = reg * trace
        else:
            R = reg
        # Add R to the diagonal of the (n_neighbors x n_neighbors) Gram matrix.
        G.flat[:: n_neighbors + 1] += R
        # `assume_a="pos"` replaces the removed `sym_pos=True` keyword of
        # scipy.linalg.solve (symmetric positive definite system).
        w = solve(G, v, assume_a="pos")
        B[i, :] = w / np.sum(w)
    return B
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
    """Build the barycenter-weighted k-neighbors graph of the points in X.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        Number of neighbors for each sample.

    reg : float, default=1e-3
        Amount of regularization when solving the least-squares
        problem. Only relevant if mode='barycenter'. If None, use the
        default.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix in CSR format, shape = [n_samples, n_samples]
        A[i, j] is assigned the weight of edge that connects i to j.

    See Also
    --------
    sklearn.neighbors.kneighbors_graph
    sklearn.neighbors.radius_neighbors_graph
    """
    neigh = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    neigh = neigh.fit(X)
    fitted_X = neigh._fit_X
    n_rows = neigh.n_samples_fit_

    # One extra neighbor is requested and the first column is discarded
    # (expected to be the sample itself, since the queries are the fitted
    # points).
    all_neighbors = neigh.kneighbors(fitted_X, return_distance=False)
    neighbor_idx = all_neighbors[:, 1:]

    weights = barycenter_weights(fitted_X, fitted_X, neighbor_idx, reg=reg)

    # Every row stores exactly n_neighbors entries, so the CSR row pointer
    # is a plain arithmetic progression.
    row_ptr = np.arange(0, n_rows * n_neighbors + 1, n_neighbors)
    csr_parts = (weights.ravel(), neighbor_idx.ravel(), row_ptr)
    return csr_matrix(csr_parts, shape=(n_rows, n_rows))
def null_space(
    M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
    """
    Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite

    k : int
        Number of eigenvalues/vectors to return

    k_skip : int, default=1
        Number of low eigenvalues to skip.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
        auto : algorithm will attempt to choose the best method for input data
        arpack : use arnoldi iteration in shift-invert mode.
                    For this method, M may be a dense matrix, sparse matrix,
                    or general linear operator.
                    Warning: ARPACK can be unstable for some problems.  It is
                    best to try several random seeds in order to check results.
        dense  : use standard dense matrix operations for the eigenvalue
                    decomposition.  For this method, M must be an array
                    or matrix type.  This method should be avoided for
                    large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for 'arpack' method.
        Not used if eigen_solver=='dense'

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    eigen_vectors : ndarray of shape (n, k)
        The `k` eigenvectors kept after skipping the `k_skip` lowest ones,
        stored as columns.

    eigen_value_sum : float
        Sum of the eigenvalues associated with the returned eigenvectors.

    Raises
    ------
    ValueError
        If `eigen_solver` is not recognized, or if ARPACK raises a
        RuntimeError (re-raised with an explanatory message).
    """
    if eigen_solver == "auto":
        # ARPACK is only chosen for larger matrices when few eigenpairs
        # are requested; otherwise use the dense solver.
        if M.shape[0] > 200 and k + k_skip < 10:
            eigen_solver = "arpack"
        else:
            eigen_solver = "dense"

    if eigen_solver == "arpack":
        v0 = _init_arpack_v0(M.shape[0], random_state)
        try:
            eigen_values, eigen_vectors = eigsh(
                M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
            )
        except RuntimeError as e:
            raise ValueError(
                "Error in determining null-space with ARPACK. Error message: "
                "'%s'. Note that eigen_solver='arpack' can fail when the "
                "weight matrix is singular or otherwise ill-behaved. In that "
                "case, eigen_solver='dense' is recommended. See online "
                "documentation for more information." % e
            ) from e

        return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
    elif eigen_solver == "dense":
        if hasattr(M, "toarray"):
            M = M.toarray()
        # `subset_by_index` replaces the removed `eigvals` keyword of
        # scipy.linalg.eigh; both select eigenpairs by inclusive index range.
        eigen_values, eigen_vectors = eigh(
            M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
        )
        index = np.argsort(np.abs(eigen_values))
        return eigen_vectors[:, index], np.sum(eigen_values)
    else:
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
def locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    """Perform a Locally Linear Embedding analysis on the data.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        number of neighbors to consider for each point.

    n_components : int
        number of coordinates for the manifold.

    reg : float, default=1e-3
        regularization constant, multiplies the trace of the local covariance
        matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        auto : algorithm will attempt to choose the best method for input data

        arpack : use arnoldi iteration in shift-invert mode.
                    For this method, M may be a dense matrix, sparse matrix,
                    or general linear operator.
                    Warning: ARPACK can be unstable for some problems.  It is
                    best to try several random seeds in order to check results.

        dense  : use standard dense matrix operations for the eigenvalue
                    decomposition.  For this method, M must be an array
                    or matrix type.  This method should be avoided for
                    large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        maximum number of iterations for the arpack solver.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        standard : use the standard locally linear embedding algorithm.
                   see reference [1]_
        hessian  : use the Hessian eigenmap method.  This method requires
                   n_neighbors > n_components * (n_components + 3) / 2.
                   see reference [2]_
        modified : use the modified locally linear embedding algorithm.
                   see reference [3]_
        ltsa     : use local tangent space alignment algorithm
                   see reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if method == 'hessian'

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if method == 'modified'

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    Y : array-like, shape [n_samples, n_components]
        Embedding vectors.

    squared_error : float
        Reconstruction error for the embedding vectors. Equivalent to
        ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.

    Raises
    ------
    ValueError
        If `eigen_solver` or `method` is not recognized, if `n_components`
        exceeds the input dimension, if `n_neighbors` is not in
        ``[1, n_samples - 1]``, or if `n_neighbors` is too small for the
        chosen `method`.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding.  Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A.  100:5591 (2003).
    .. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ.  8:406 (2004)
    """
    if eigen_solver not in ("auto", "arpack", "dense"):
        raise ValueError("unrecognized eigen_solver '%s'" % eigen_solver)

    if method not in ("standard", "hessian", "modified", "ltsa"):
        raise ValueError("unrecognized method '%s'" % method)

    # One extra neighbor is requested; the branches below drop the first
    # column of each neighbor query.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    nbrs.fit(X)
    X = nbrs._fit_X

    N, d_in = X.shape

    if n_components > d_in:
        raise ValueError(
            "output dimension must be less than or equal to input dimension"
        )
    if n_neighbors >= N:
        # The guard rejects n_neighbors == n_samples, so the requirement
        # reported to the user is a strict inequality.
        raise ValueError(
            "Expected n_neighbors < n_samples, but n_samples = %d, n_neighbors = %d"
            % (N, n_neighbors)
        )

    if n_neighbors <= 0:
        raise ValueError("n_neighbors must be positive")

    # Any solver other than 'dense' gets M as a sparse matrix where a
    # conversion is performed below.
    M_sparse = eigen_solver != "dense"

    if method == "standard":
        W = barycenter_kneighbors_graph(
            nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
        )

        # we'll compute M = (I-W)'(I-W)
        # depending on the solver, we'll do this differently
        if M_sparse:
            M = eye(*W.shape, format=W.format) - W
            M = (M.T * M).tocsr()
        else:
            M = (W.T * W - W.T - W).toarray()
            # Add the identity on the diagonal to complete
            # (I-W)'(I-W) = I - W - W' + W'W.
            M.flat[:: M.shape[0] + 1] += 1

    elif method == "hessian":
        dp = n_components * (n_components + 1) // 2

        if n_neighbors <= n_components + dp:
            raise ValueError(
                "for method='hessian', n_neighbors must be "
                "greater than "
                "[n_components * (n_components + 3) / 2]"
            )

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
        Yi[:, 0] = 1

        M = np.zeros((N, N), dtype=np.float64)

        use_svd = n_neighbors > d_in

        for i in range(N):
            Gi = X[neighbors[i]]
            Gi -= Gi.mean(0)

            # build Hessian estimator
            if use_svd:
                U = svd(Gi, full_matrices=0)[0]
            else:
                Ci = np.dot(Gi, Gi.T)
                U = eigh(Ci)[1][:, ::-1]

            Yi[:, 1 : 1 + n_components] = U[:, :n_components]

            # Fill the remaining dp columns with the pairwise products
            # U[:, k] * U[:, l] for k <= l < n_components.
            j = 1 + n_components
            for k in range(n_components):
                Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
                j += n_components - k

            Q, R = qr(Yi)

            w = Q[:, n_components + 1 :]
            S = w.sum(0)

            # Avoid dividing by (near-)zero column sums.
            S[np.where(abs(S) < hessian_tol)] = 1
            w /= S

            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(w, w.T)

        if M_sparse:
            M = csr_matrix(M)

    elif method == "modified":
        if n_neighbors < n_components:
            raise ValueError("modified LLE requires n_neighbors >= n_components")

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        # find the eigenvectors and eigenvalues of each local covariance
        # matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
        # where the columns are eigenvectors
        V = np.zeros((N, n_neighbors, n_neighbors))
        nev = min(d_in, n_neighbors)
        evals = np.zeros([N, nev])

        # choose the most efficient way to find the eigenvectors
        use_svd = n_neighbors > d_in

        if use_svd:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
            evals **= 2
        else:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                C_nbrs = np.dot(X_nbrs, X_nbrs.T)
                evi, vi = eigh(C_nbrs)
                evals[i] = evi[::-1]
                V[i] = vi[:, ::-1]

        # find regularized weights: this is like normal LLE.
        # because we've already computed the SVD of each covariance matrix,
        # it's faster to use this rather than np.linalg.solve
        # NOTE(review): the `reg` argument is overwritten here with a
        # hard-coded 1e-3 factor, so a caller-supplied `reg` has no effect
        # for method='modified' -- confirm whether this is intended.
        reg = 1e-3 * evals.sum(1)

        tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
        tmp[:, :nev] /= evals + reg[:, None]
        tmp[:, nev:] /= reg[:, None]

        w_reg = np.zeros((N, n_neighbors))
        for i in range(N):
            w_reg[i] = np.dot(V[i], tmp[i])
        w_reg /= w_reg.sum(1)[:, None]

        # calculate eta: the median of the ratio of small to large eigenvalues
        # across the points.  This is used to determine s_i, below
        rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
        eta = np.median(rho)

        # find s_i, the size of the "almost null space" for each point:
        # this is the size of the largest set of eigenvalues
        # such that Sum[v; v in set]/Sum[v; v not in set] < eta
        s_range = np.zeros(N, dtype=int)
        evals_cumsum = stable_cumsum(evals, 1)
        eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
        for i in range(N):
            s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
        s_range += n_neighbors - nev  # number of zero eigenvalues

        # Now calculate M.
        # This is the [N x N] matrix whose null space is the desired embedding
        M = np.zeros((N, N), dtype=np.float64)
        for i in range(N):
            s_i = s_range[i]

            # select bottom s_i eigenvectors and calculate alpha
            Vi = V[i, :, n_neighbors - s_i :]
            alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)

            # compute Householder matrix which satisfies
            #  Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
            # using prescription from paper
            h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))

            norm_h = np.linalg.norm(h)
            if norm_h < modified_tol:
                h *= 0
            else:
                h /= norm_h

            # Householder matrix is
            #  >> Hi = np.identity(s_i) - 2*np.outer(h,h)
            # Then the weight matrix is
            #  >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
            # We do this much more efficiently:
            Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]

            # Update M as follows:
            # >> W_hat = np.zeros( (N,s_i) )
            # >> W_hat[neighbors[i],:] = Wi
            # >> W_hat[i] -= 1
            # >> M += np.dot(W_hat,W_hat.T)
            # We can do this much more efficiently:
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
            Wi_sum1 = Wi.sum(1)
            M[i, neighbors[i]] -= Wi_sum1
            M[neighbors[i], i] -= Wi_sum1
            M[i, i] += s_i

        if M_sparse:
            M = csr_matrix(M)

    elif method == "ltsa":
        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        M = np.zeros((N, N))

        use_svd = n_neighbors > d_in

        for i in range(N):
            Xi = X[neighbors[i]]
            Xi -= Xi.mean(0)

            # compute n_components largest eigenvalues of Xi * Xi^T
            if use_svd:
                v = svd(Xi, full_matrices=True)[0]
            else:
                Ci = np.dot(Xi, Xi.T)
                v = eigh(Ci)[1][:, ::-1]

            Gi = np.zeros((n_neighbors, n_components + 1))
            Gi[:, 1:] = v[:, :n_components]
            Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)

            GiGiT = np.dot(Gi, Gi.T)

            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] -= GiGiT
            M[neighbors[i], neighbors[i]] += 1

    return null_space(
        M,
        n_components,
        k_skip=1,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
    )
class LocallyLinearEmbedding(
_ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_UnstableArchMixin,
BaseEstimator,
):
"""Locally Linear Embedding.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to consider for each point.
n_components : int, default=2
Number of coordinates for the manifold.
reg : float, default=1e-3
Regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
The solver used to compute the eigenvectors. The available options are:
- `'auto'` : algorithm will attempt to choose the best method for input
data.
- `'arpack'` : use arnoldi iteration in shift-invert mode. For this
method, M may be a dense matrix, sparse matrix, or general linear
operator.
- `'dense'` : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array or matrix type.
This method should be avoided for large problems.
.. warning::
ARPACK can be unstable for some problems. It is best to try several
random seeds in order to check results.
tol : float, default=1e-6
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for the arpack solver.
Not used if eigen_solver=='dense'.
method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
- `standard`: use the standard locally linear embedding algorithm. see
reference [1]_
- `hessian`: use the Hessian eigenmap method. This method requires
``n_neighbors > n_components * (1 + (n_components + 1) / 2``. see
reference [2]_
- `modified`: use the modified locally linear embedding algorithm.
see reference [3]_
- `ltsa`: use local tangent space alignment algorithm. see
reference [4]_
hessian_tol : float, default=1e-4
Tolerance for Hessian eigenmapping method.
Only used if ``method == 'hessian'``.
modified_tol : float, default=1e-12
Tolerance for modified LLE method.
Only used if ``method == 'modified'``.
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
default='auto'
Algorithm to use for nearest neighbors search, passed to
:class:`~sklearn.neighbors.NearestNeighbors` instance.
random_state : int, RandomState instance, default=None
Determines the random number generator when
``eigen_solver`` == 'arpack'. Pass an int for reproducible results
across multiple function calls. See :term:`Glossary <random_state>`.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : array-like, shape [n_samples, n_components]
Stores the embedding vectors
reconstruction_error_ : float
Reconstruction error associated with `embedding_`
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
nbrs_ : NearestNeighbors object
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.
See Also
--------
SpectralEmbedding : Spectral embedding for non-linear dimensionality
reduction.
TSNE : Distributed Stochastic Neighbor Embedding.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import LocallyLinearEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = LocallyLinearEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
def __init__(
    self,
    *,
    n_neighbors=5,
    n_components=2,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    neighbors_algorithm="auto",
    random_state=None,
    n_jobs=None,
):
    # Every keyword argument is stored unchanged under an attribute of the
    # same name; no validation or conversion happens here.

    # Neighborhood and output size.
    self.n_neighbors = n_neighbors
    self.n_components = n_components
    self.neighbors_algorithm = neighbors_algorithm

    # Embedding variant and its tolerances.
    self.method = method
    self.reg = reg
    self.hessian_tol = hessian_tol
    self.modified_tol = modified_tol

    # Eigen-solver configuration.
    self.eigen_solver = eigen_solver
    self.tol = tol
    self.max_iter = max_iter
    self.random_state = random_state

    # Parallelism.
    self.n_jobs = n_jobs
def _fit_transform(self, X):
    """Fit the neighbors index on ``X`` and compute the embedding.

    Sets ``nbrs_``, ``embedding_``, ``reconstruction_error_`` and
    ``_n_features_out`` on the estimator; returns nothing.
    """
    self.nbrs_ = NearestNeighbors(
        n_neighbors=self.n_neighbors,
        algorithm=self.neighbors_algorithm,
        n_jobs=self.n_jobs,
    )
    rng = check_random_state(self.random_state)
    X = self._validate_data(X, dtype=float)
    self.nbrs_.fit(X)

    # Settings forwarded to the functional API. The fitted neighbors
    # object is passed in place of the raw data.
    lle_kwargs = dict(
        n_neighbors=self.n_neighbors,
        n_components=self.n_components,
        reg=self.reg,
        eigen_solver=self.eigen_solver,
        tol=self.tol,
        max_iter=self.max_iter,
        method=self.method,
        hessian_tol=self.hessian_tol,
        modified_tol=self.modified_tol,
        random_state=rng,
        n_jobs=self.n_jobs,
    )
    embedding, error = locally_linear_embedding(X=self.nbrs_, **lle_kwargs)
    self.embedding_ = embedding
    self.reconstruction_error_ = error
    self._n_features_out = self.embedding_.shape[1]
def fit(self, X, y=None):
    """Learn the embedding of the training data ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training set.

    y : Ignored
        Accepted only for API consistency; never read.

    Returns
    -------
    self : object
        The fitted `LocallyLinearEmbedding` instance.
    """
    # All the work, and all fitted attributes, come from the shared helper.
    self._fit_transform(X)
    return self
def fit_transform(self, X, y=None):
    """Learn the embedding of ``X`` and return the embedded training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training set.

    y : Ignored
        Accepted only for API consistency; never read.

    Returns
    -------
    X_new : array-like, shape (n_samples, n_components)
        The ``embedding_`` attribute computed for the training samples.
    """
    self._fit_transform(X)
    embedded = self.embedding_
    return embedded
def transform(self, X):
    """
    Map new samples into the embedding space learned during fit.

    For each sample, its nearest training neighbors are looked up,
    barycenter weights over those neighbors are computed, and the same
    weights are applied to the neighbors' embedded coordinates.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to embed.

    Returns
    -------
    X_new : ndarray of shape (n_samples, n_components)
        Embedded coordinates of the samples in ``X``.

    Notes
    -----
    Because of scaling performed by this method, it is discouraged to use
    it together with methods that are not scale-invariant (like SVMs).
    """
    check_is_fitted(self)
    X = self._validate_data(X, reset=False)

    neighbor_idx = self.nbrs_.kneighbors(
        X, n_neighbors=self.n_neighbors, return_distance=False
    )
    weights = barycenter_weights(X, self.nbrs_._fit_X, neighbor_idx, reg=self.reg)

    X_new = np.empty((X.shape[0], self.n_components))
    # One row per query sample: weighted sum of its neighbors' embeddings.
    for row, (idx, w) in enumerate(zip(neighbor_idx, weights)):
        X_new[row] = np.dot(self.embedding_[idx].T, w)
    return X_new

View File

@@ -0,0 +1,537 @@
"""
Multi-dimensional Scaling (MDS).
"""
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
# License: BSD
import numpy as np
from joblib import Parallel, effective_n_jobs
import warnings
from ..base import BaseEstimator
from ..metrics import euclidean_distances
from ..utils import check_random_state, check_array, check_symmetric
from ..isotonic import IsotonicRegression
from ..utils.fixes import delayed
def _smacof_single(
    dissimilarities,
    metric=True,
    n_components=2,
    init=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
):
    """Run the SMACOF algorithm once, from a single starting configuration.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        ``True`` runs metric SMACOF. ``False`` runs the nonmetric variant,
        in which the disparities are refitted by an isotonic regression at
        every iteration.

    n_components : int, default=2
        Number of dimensions of the embedding. When ``init`` is given this
        value is overridden by the number of columns of ``init``.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration. When ``None``, a configuration is drawn
        uniformly at random.

    max_iter : int, default=300
        Maximum number of iterations.

    verbose : int, default=0
        Level of verbosity. Any non-zero value reports the iteration at
        which the loop stops early; ``>= 2`` also prints the stress at
        every iteration.

    eps : float, default=1e-3
        Tolerance used to declare convergence: iteration stops once the
        decrease of the stress, scaled by the sum of the row norms of the
        configuration, falls below ``eps``.

    random_state : int, RandomState instance or None, default=None
        Random number generator (or seed) used to draw the initial
        configuration when ``init`` is ``None``.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        Stress computed during the last iteration (half the sum of squared
        differences between distances and disparities).

    n_iter : int
        Number of iterations performed.
    """
    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
    n_samples = dissimilarities.shape[0]
    random_state = check_random_state(random_state)

    # Strict upper triangle, flattened. Zero dissimilarities are treated as
    # missing values by the nonmetric branch below.
    sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
    observed = sim_flat != 0
    sim_flat_w = sim_flat[observed]

    if init is not None:
        # The shape of ``init`` overrides the ``n_components`` argument.
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError(
                "init matrix should be of shape (%d, %d)" % (n_samples, n_components)
            )
        X = init
    else:
        # Random starting configuration.
        X = random_state.uniform(size=n_samples * n_components)
        X = X.reshape((n_samples, n_components))

    previous = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        distances = euclidean_distances(X)

        if metric:
            disparities = dissimilarities
        else:
            dist_flat = distances.ravel()
            # Monotonic regression of the current distances on the
            # observed dissimilarities.
            fitted = ir.fit_transform(sim_flat_w, dist_flat[observed])
            disparities = dist_flat.copy()
            disparities[observed] = fitted
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
            )

        stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Guttman transform; zero distances are replaced to avoid dividing
        # by zero.
        distances[distances == 0] = 1e-5
        ratio = disparities / distances
        B = -ratio
        diag = np.arange(len(B))
        B[diag, diag] += ratio.sum(axis=1)
        X = 1.0 / n_samples * np.dot(B, X)

        scale = np.sqrt((X**2).sum(axis=1)).sum()
        if verbose >= 2:
            print("it: %d, stress %s" % (it, stress))
        if previous is not None and (previous - stress / scale) < eps:
            if verbose:
                print("breaking at iteration %d with stress %s" % (it, stress))
            break
        previous = stress / scale

    return X, stress, it + 1
def smacof(
    dissimilarities,
    *,
    metric=True,
    n_components=2,
    init=None,
    n_init=8,
    n_jobs=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    return_n_iter=False,
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
    multidimensional scaling algorithm which minimizes an objective function
    (the *stress*) using a majorization technique. Stress majorization, also
    known as the Guttman Transform, guarantees a monotone convergence of
    stress, and is more powerful than traditional techniques such as gradient
    descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.
        A provided array is copied before use.

    n_init : int, default=8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden and a single run is performed
        (a warning is emitted when ``n_init`` was not already 1).

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used for the random initial
        configurations. Pass an int for reproducible results across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
    """
    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    if hasattr(init, "__array__"):
        init = np.asarray(init).copy()
        if n_init != 1:
            warnings.warn(
                "Explicit initial positions passed: "
                "performing only one init of the MDS instead of %d" % n_init
            )
            n_init = 1

    # Arguments shared by every single run, sequential or parallel.
    run_kwargs = dict(
        metric=metric,
        n_components=n_components,
        init=init,
        max_iter=max_iter,
        verbose=verbose,
        eps=eps,
    )

    best_pos, best_stress = None, None

    if effective_n_jobs(n_jobs) != 1:
        # Parallel runs: each one gets its own integer seed.
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(dissimilarities, random_state=seed, **run_kwargs)
            for seed in seeds
        )
        positions, stress, n_iters = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
        best_iter = n_iters[best]
    else:
        # Sequential runs share the same random generator; keep the run
        # with the lowest stress.
        for _ in range(n_init):
            pos, stress, n_iter_ = _smacof_single(
                dissimilarities, random_state=random_state, **run_kwargs
            )
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
                best_iter = n_iter_

    return (
        (best_pos, best_stress, best_iter)
        if return_n_iter
        else (best_pos, best_stress)
    )
class MDS(BaseEstimator):
    """Multidimensional scaling.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities.

    metric : bool, default=True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.

    n_init : int, default=4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used for the random initial
        configurations. Pass an int for reproducible results across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
        Dissimilarity measure to use:

        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
            Pre-computed dissimilarities are passed directly to ``fit`` and
            ``fit_transform``.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Symmetric matrix that:

        - either uses a custom dissimilarity matrix by setting `dissimilarity`
          to 'precomputed';
        - or constructs a dissimilarity matrix from data using
          Euclidean distances.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    Isomap : Manifold learning based on Isometric Mapping.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        n_components=2,
        *,
        metric=True,
        n_init=4,
        max_iter=300,
        verbose=0,
        eps=1e-3,
        n_jobs=None,
        random_state=None,
        dissimilarity="euclidean",
    ):
        # Constructor arguments are stored unchanged; nothing is validated
        # or computed here.
        self.n_components = n_components
        self.metric = metric
        self.dissimilarity = dissimilarity

        self.n_init = n_init
        self.max_iter = max_iter
        self.eps = eps

        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def _more_tags(self):
        # The estimator is "pairwise" exactly when it is fed a precomputed
        # dissimilarity matrix.
        is_pairwise = self.dissimilarity == "precomputed"
        return {"pairwise": is_pairwise}

    def fit(self, X, y=None, init=None):
        """
        Compute the position of the points in the embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Fitting and transforming are the same computation; the returned
        # embedding is simply discarded here.
        self.fit_transform(X, init=init)
        return self

    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from `X`, and returns the embedded coordinates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            X transformed in the new space.
        """
        X = self._validate_data(X)
        precomputed = self.dissimilarity == "precomputed"

        # A square input that is not declared as precomputed may be a
        # dissimilarity matrix passed by mistake: warn, but carry on.
        if not precomputed and X.shape[0] == X.shape[1]:
            warnings.warn(
                "The MDS API has changed. ``fit`` now constructs an"
                " dissimilarity matrix from data. To use a custom "
                "dissimilarity matrix, set "
                "``dissimilarity='precomputed'``."
            )

        if precomputed:
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        else:
            raise ValueError(
                "Proximity must be 'precomputed' or 'euclidean'. Got %s instead"
                % str(self.dissimilarity)
            )

        embedding, stress, n_iter = smacof(
            self.dissimilarity_matrix_,
            metric=self.metric,
            n_components=self.n_components,
            init=init,
            n_init=self.n_init,
            n_jobs=self.n_jobs,
            max_iter=self.max_iter,
            verbose=self.verbose,
            eps=self.eps,
            random_state=self.random_state,
            return_n_iter=True,
        )
        self.embedding_ = embedding
        self.stress_ = stress
        self.n_iter_ = n_iter

        return self.embedding_

View File

@@ -0,0 +1,671 @@
"""Spectral Embedding."""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Wei LI <kuantkid@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
from ..base import BaseEstimator
from ..utils import (
check_array,
check_random_state,
check_symmetric,
)
from ..utils._arpack import _init_arpack_v0
from ..utils.extmath import _deterministic_vector_sign_flip
from ..utils.fixes import lobpcg
from ..metrics.pairwise import rbf_kernel
from ..neighbors import kneighbors_graph, NearestNeighbors
def _graph_connected_component(graph, node_id):
    """Find the connected component of the graph that contains a given node.

    The component is grown outwards from ``node_id``, one layer of
    neighbors at a time, until no new node is reached.

    Parameters
    ----------
    graph : array-like of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    node_id : int
        The index of the query node of the graph.

    Returns
    -------
    connected_components_matrix : array-like of shape (n_samples,)
        Boolean mask that is True for the nodes belonging to the same
        connected component as the query node.
    """
    n_node = graph.shape[0]
    is_sparse = sparse.issparse(graph)
    if is_sparse:
        # CSR gives fast access to individual rows.
        graph = graph.tocsr()

    visited = np.zeros(n_node, dtype=bool)
    frontier = np.zeros(n_node, dtype=bool)
    frontier[node_id] = True

    for _ in range(n_node):
        n_before = visited.sum()
        np.logical_or(visited, frontier, out=visited)
        if visited.sum() <= n_before:
            # The last frontier contributed nothing new: we are done.
            break

        current = np.where(frontier)[0]
        frontier.fill(False)
        # The next frontier is the union of the rows of the current nodes.
        for i in current:
            row = graph[i].toarray().ravel() if is_sparse else graph[i]
            np.logical_or(frontier, row, out=frontier)

    return visited
def _graph_is_connected(graph):
    """Tell whether the graph consists of a single connected component.

    Parameters
    ----------
    graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not.
    """
    if not sparse.isspmatrix(graph):
        # Dense graph: grow the component of node 0 and check that it
        # covers every node.
        reached = _graph_connected_component(graph, 0)
        return reached.sum() == graph.shape[0]

    # Sparse graph: count all connected components.
    n_connected_components, _ = connected_components(graph)
    return n_connected_components == 1
def _set_diag(laplacian, value, norm_laplacian):
    """Overwrite the diagonal of the laplacian and pick a storage format
    suited to eigenvalue decomposition.

    Parameters
    ----------
    laplacian : {ndarray, sparse matrix}
        The graph laplacian.

    value : float
        The value to write on the diagonal.

    norm_laplacian : bool
        Whether the diagonal should be overwritten at all.

    Returns
    -------
    laplacian : {array, sparse matrix}
        A dense input is returned as is (modified in place when
        ``norm_laplacian`` is True). A sparse input is returned in DIA
        format when it has at most 7 distinct diagonals, and in CSR format
        otherwise.
    """
    n_nodes = laplacian.shape[0]

    if not sparse.isspmatrix(laplacian):
        if norm_laplacian:
            # Stepping by n_nodes + 1 through the flat view walks the diagonal.
            laplacian.flat[:: n_nodes + 1] = value
        return laplacian

    laplacian = laplacian.tocoo()
    # Offset 0 marks the stored entries that lie on the main diagonal.
    offsets = laplacian.row - laplacian.col
    if norm_laplacian:
        laplacian.data[offsets == 0] = value

    # With few diagonals (e.g. structured matrices coming from images) the
    # dia format is best suited to matvec products; otherwise csr has the
    # fastest matvec and is thus best suited to arpack.
    if np.unique(offsets).size <= 7:
        # 3 or less outer diagonals on each side
        return laplacian.todia()
    return laplacian.tocsr()
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol=0.0,
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used to initialize the ARPACK
        starting vector and the initial approximation of the eigenvectors
        passed to lobpcg (``eigen_solver`` 'lobpcg' or 'amg'). Use an int to
        make the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default=0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If ``eigen_solver`` is not one of the supported values, if it is
        'amg' while pyamg cannot be imported, or if the lobpcg-based solvers
        return a single eigenvector.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`
    """
    adjacency = check_symmetric(adjacency)

    # pyamg is optional: its absence is an error only when it was requested.
    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError as e:
        if eigen_solver == "amg":
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"
    elif eigen_solver not in ("arpack", "lobpcg", "amg"):
        raise ValueError(
            "Unknown value for eigen_solver: '%s'. "
            "Should be 'amg', 'arpack', or 'lobpcg'" % eigen_solver
        )

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    if eigen_solver == "arpack" or (
        eigen_solver != "lobpcg"
        and (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=eigen_tol, v0=v0
            )
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == "amg":
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # The Laplacian matrix is always singular, having at least one zero
        # eigenvalue, corresponding to the trivial eigenvector, which is a
        # constant. Using a singular matrix for preconditioning may result in
        # random failures in LOBPCG and is not supported by the existing
        # theory:
        #     see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagonal is not all ones. The shift
        # does change the eigenpairs however, so we'll feed the shifted
        # matrix to the solver and afterward set it back to the original.
        diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
        laplacian += diag_shift
        ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
        laplacian -= diag_shift

        M = ml.aspreconditioner()
        # Create initial approximation X to eigenvectors
        X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
        X[:, 0] = dd.ravel()
        X = X.astype(laplacian.dtype)
        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.0e-5, largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # recover u = D^-1/2 x from the eigenvector output x
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError(
                "The 'amg' eigen-solver returned a single eigenvector; "
                "the spectral embedding cannot be computed."
            )

    # Not an ``elif``: the arpack branch above may have switched
    # ``eigen_solver`` to "lobpcg" as a fallback.
    if eigen_solver == "lobpcg":
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            _, diffusion_map = eigh(laplacian, check_finite=False)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension and create initial
            # approximation X to eigenvectors
            X = random_state.standard_normal(
                size=(laplacian.shape[0], n_components + 1)
            )
            X[:, 0] = dd.ravel()
            X = X.astype(laplacian.dtype)
            _, diffusion_map = lobpcg(
                laplacian, X, tol=1e-5, largest=False, maxiter=2000
            )
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError(
                    "The 'lobpcg' eigen-solver returned a single eigenvector; "
                    "the spectral embedding cannot be computed."
                )

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
class SpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    n_components : int, default=2
        The dimension of the projected subspace.

    affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
                'precomputed_nearest_neighbors'} or callable, \
                default='nearest_neighbors'
        How to construct the affinity matrix.
         - 'nearest_neighbors' : construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf' : construct the affinity matrix by computing a radial basis
           function (RBF) kernel.
         - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
         - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
           of precomputed nearest neighbors, and constructs the affinity matrix
           by selecting the ``n_neighbors`` nearest neighbors.
         - callable : use passed in function as affinity
           the function takes in data matrix (n_samples, n_features)
           and return affinity matrix (n_samples, n_samples).

    gamma : float, default=None
        Kernel coefficient for rbf kernel. If None, gamma will be set to
        1/n_features.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when `eigen_solver` is
        `'amg'` or `'lobpcg'`. Use an int to make the results deterministic
        across calls (See :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems.
        If None, then ``'arpack'`` is used.

    n_neighbors : int, default=None
        Number of nearest neighbors for nearest_neighbors graph building.
        If None, n_neighbors will be set to max(n_samples/10, 1).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Spectral embedding of the training matrix.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Affinity_matrix constructed from samples or precomputed.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_neighbors_ : int
        Number of nearest neighbors effectively used.

    See Also
    --------
    Isomap : Non-linear dimensionality reduction through Isometric Mapping.

    References
    ----------

    - :doi:`A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      <10.1007/s11222-007-9033-z>`

    - On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100

    - :doi:`Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      <10.1109/34.868688>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import SpectralEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = SpectralEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        n_components=2,
        *,
        affinity="nearest_neighbors",
        gamma=None,
        random_state=None,
        eigen_solver=None,
        n_neighbors=None,
        n_jobs=None,
    ):
        self.n_components = n_components
        self.affinity = affinity
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _more_tags(self):
        # X is a pairwise (n_samples, n_samples) matrix only for the
        # precomputed affinities.
        return {
            "pairwise": self.affinity
            in ["precomputed", "precomputed_nearest_neighbors"]
        }

    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data.

        The result is also stored in ``self.affinity_matrix_``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : array-like of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Y : Ignored

        Returns
        -------
        affinity_matrix of shape (n_samples, n_samples)
        """
        # Work on a local copy of the strategy: the constructor parameter
        # ``self.affinity`` must not be overwritten while fitting, otherwise
        # a later fit (or a clone) would silently use a different affinity.
        affinity = self.affinity

        if affinity == "precomputed":
            self.affinity_matrix_ = X
            return self.affinity_matrix_

        if affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            # symmetrize the connectivity graph
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
            return self.affinity_matrix_

        if affinity == "nearest_neighbors":
            if sparse.issparse(X):
                warnings.warn(
                    "Nearest neighbors affinity currently does "
                    "not support sparse input, falling back to "
                    "rbf affinity"
                )
                # fall through to the rbf branch below for this call only
                affinity = "rbf"
            else:
                self.n_neighbors_ = (
                    self.n_neighbors
                    if self.n_neighbors is not None
                    else max(int(X.shape[0] / 10), 1)
                )
                self.affinity_matrix_ = kneighbors_graph(
                    X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
                )
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (
                    self.affinity_matrix_ + self.affinity_matrix_.T
                )
                return self.affinity_matrix_

        if affinity == "rbf":
            self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_

        # Remaining case: a user-supplied callable.
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_

    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix}, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `affinity` is neither one of the supported names nor a
            callable.
        """
        X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)

        random_state = check_random_state(self.random_state)
        if isinstance(self.affinity, str):
            if self.affinity not in {
                "nearest_neighbors",
                "rbf",
                "precomputed",
                "precomputed_nearest_neighbors",
            }:
                raise ValueError(
                    "%s is not a valid affinity. Expected "
                    "'precomputed', 'precomputed_nearest_neighbors', 'rbf', "
                    "'nearest_neighbors' or a callable."
                    % self.affinity
                )
        elif not callable(self.affinity):
            # Wrap in a 1-tuple so that tuple-valued input is formatted
            # instead of being unpacked by the ``%`` operator.
            raise ValueError(
                "'affinity' is expected to be an affinity name or a callable. Got: %s"
                % (self.affinity,)
            )

        affinity_matrix = self._get_affinity_matrix(X)
        self.embedding_ = spectral_embedding(
            affinity_matrix,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix} of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_components)
            Spectral embedding of the training matrix.
        """
        self.fit(X)
        return self.embedding_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,39 @@
import os
import numpy
def configuration(parent_package="", top_path=None):
    """Return the numpy.distutils configuration of the ``manifold`` subpackage.

    Registers the two Cython extensions ``_utils`` and ``_barnes_hut_tsne``
    and the ``tests`` subpackage.
    """
    from numpy.distutils.misc_util import Configuration

    config = Configuration("manifold", parent_package, top_path)

    # The "m" library is only requested on POSIX platforms.
    libraries = ["m"] if os.name == "posix" else []

    # Both extensions are built with exactly the same settings.
    for extension in ("_utils", "_barnes_hut_tsne"):
        config.add_extension(
            extension,
            sources=[extension + ".pyx"],
            include_dirs=[numpy.get_include()],
            libraries=libraries,
            extra_compile_args=["-O3"],
        )

    config.add_subpackage("tests")
    return config
# When executed as a script, hand this subpackage's configuration to
# numpy.distutils' ``setup``.
if __name__ == "__main__":
    from numpy.distutils.core import setup

    setup(**configuration().todict())

View File

@@ -0,0 +1,293 @@
from itertools import product
import numpy as np
import math
from numpy.testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
import pytest
from sklearn import datasets
from sklearn import manifold
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils._testing import assert_allclose, assert_allclose_dense_sparse
from scipy.sparse import rand as sparse_rand
# Values passed as Isomap's ``eigen_solver`` and ``path_method`` arguments by
# the tests below that loop over every combination.
eigen_solvers = ["auto", "dense", "arpack"]
path_methods = ["auto", "FW", "D"]
def create_sample_data(n_pts=25, add_noise=False):
    """Return a square 2D integer grid, optionally with a noisy third column.

    Parameters
    ----------
    n_pts : int, default=25
        Requested number of points. The grid has ``int(sqrt(n_pts))`` points
        per side, so fewer than `n_pts` points are returned when `n_pts` is
        not a perfect square.

    add_noise : bool, default=False
        If True, append a third coordinate holding Gaussian noise scaled by
        0.1, drawn from a ``RandomState`` seeded with 0.

    Returns
    -------
    X : ndarray of shape (n_grid_pts, 2) or (n_grid_pts, 3)
        The grid coordinates, where ``n_grid_pts = int(sqrt(n_pts)) ** 2``.
    """
    # grid of equidistant points in 2D, n_components = n_dim
    n_per_side = int(math.sqrt(n_pts))
    # reshape keeps the array 2D even when the grid is empty
    X = np.array(list(product(range(n_per_side), repeat=2))).reshape(-1, 2)
    if add_noise:
        # add noise in a third dimension; size it from the actual grid so
        # that a non-square ``n_pts`` does not break the concatenation
        rng = np.random.RandomState(0)
        noise = 0.1 * rng.randn(X.shape[0], 1)
        X = np.concatenate((X, noise), 1)
    return X
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
def test_isomap_simple_grid(n_neighbors, radius):
    """Isomap should preserve distances when all neighbors are used."""
    n_pts = 25
    X = create_sample_data(n_pts=n_pts, add_noise=False)

    def distance_graph(data):
        # distances from each point to all others, using whichever
        # neighborhood definition the test case selected
        if n_neighbors is not None:
            return neighbors.kneighbors_graph(data, n_neighbors, mode="distance")
        return neighbors.radius_neighbors_graph(data, radius, mode="distance")

    G = distance_graph(X)

    # Every (eigen_solver, path_method) combination must reproduce the
    # input distance graph in the embedded space.
    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        clf = manifold.Isomap(
            n_neighbors=n_neighbors,
            radius=radius,
            n_components=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        clf.fit(X)

        G_iso = distance_graph(clf.embedding_)
        assert_allclose_dense_sparse(G, G_iso)
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
def test_isomap_reconstruction_error(n_neighbors, radius):
    """Check ``reconstruction_error()`` against a manual kernel comparison.

    Same setup as in test_isomap_simple_grid, with an added noisy dimension.
    """
    n_pts = 25
    X = create_sample_data(n_pts=n_pts, add_noise=True)

    def dense_distances(data):
        if n_neighbors is not None:
            graph = neighbors.kneighbors_graph(data, n_neighbors, mode="distance")
        else:
            graph = neighbors.radius_neighbors_graph(data, radius, mode="distance")
        return graph.toarray()

    # compute input kernel
    G = dense_distances(X)
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G**2)

    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        clf = manifold.Isomap(
            n_neighbors=n_neighbors,
            radius=radius,
            n_components=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        clf.fit(X)

        # compute output kernel
        G_iso = dense_distances(clf.embedding_)
        K_iso = centerer.fit_transform(-0.5 * G_iso**2)

        # make sure error agrees
        expected_error = np.linalg.norm(K - K_iso) / n_pts
        assert_almost_equal(expected_error, clf.reconstruction_error())
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(n_neighbors, radius):
    """Re-embedding slightly perturbed training points stays close to the fit."""
    n_samples, n_components, noise_scale = 200, 10, 0.01

    # Create S-curve dataset
    X, _ = datasets.make_s_curve(n_samples, random_state=0)

    # Compute isomap embedding
    iso = manifold.Isomap(
        n_components=n_components, n_neighbors=n_neighbors, radius=radius
    )
    X_iso = iso.fit_transform(X)

    # Re-embed a noisy version of the points
    rng = np.random.RandomState(0)
    X_noisy = X + noise_scale * rng.randn(*X.shape)
    X_iso2 = iso.transform(X_noisy)

    # Make sure the rms error on re-embedding is comparable to noise_scale
    rms_error = np.sqrt(np.mean((X_iso - X_iso2) ** 2))
    assert rms_error < 2 * noise_scale
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius):
    """Isomap can be used as a transformer step inside a Pipeline."""
    # Only a coarse accuracy threshold is checked.
    # TODO check that it actually does something useful
    X, y = datasets.make_blobs(random_state=0)

    steps = [
        ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
        ("clf", neighbors.KNeighborsClassifier()),
    ]
    clf = pipeline.Pipeline(steps)
    clf.fit(X, y)

    score = clf.score(X, y)
    assert 0.9 < score
def test_pipeline_with_nearest_neighbors_transformer():
    """Chaining KNeighborsTransformer with a precomputed-metric Isomap
    gives the same embedding as a single Isomap estimator."""
    algorithm = "auto"
    n_neighbors = 10

    X, _ = datasets.make_blobs(random_state=0)
    X2, _ = datasets.make_blobs(random_state=1)

    # the chained version ...
    knn_transformer = neighbors.KNeighborsTransformer(
        n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
    )
    precomputed_isomap = manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed")
    est_chain = pipeline.make_pipeline(knn_transformer, precomputed_isomap)

    # ... and the compact version
    est_compact = manifold.Isomap(
        n_neighbors=n_neighbors, neighbors_algorithm=algorithm
    )

    # both must agree on the training data and on unseen data
    assert_array_almost_equal(est_chain.fit_transform(X), est_compact.fit_transform(X))
    assert_array_almost_equal(est_chain.transform(X2), est_compact.transform(X2))
def test_different_metric():
    """The metric parameters are honoured and default to euclidean."""

    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1**2 + x2**2))

    # (metric, p, is_euclidean)
    metrics = [
        ("euclidean", 2, True),
        ("manhattan", 1, False),
        ("minkowski", 1, False),
        ("minkowski", 2, True),
        (custom_metric, 2, False),
    ]

    X, _ = datasets.make_blobs(random_state=0)
    reference = manifold.Isomap().fit_transform(X)

    for metric, p, is_euclidean in metrics:
        embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)

        if not is_euclidean:
            # a non-euclidean metric must yield a different embedding
            with pytest.raises(AssertionError, match="not almost equal"):
                assert_array_almost_equal(embedding, reference)
            continue

        assert_array_almost_equal(embedding, reference)
def test_isomap_clone_bug():
    """Refitting after ``set_params`` must pick up the new ``n_neighbors``.

    Regression test for bug reported in #6062.
    """
    # Use a seeded generator rather than the global NumPy RNG so that the
    # test data is reproducible from run to run.
    rng = np.random.RandomState(0)
    model = manifold.Isomap()
    for n_neighbors in [10, 15, 20]:
        model.set_params(n_neighbors=n_neighbors)
        model.fit(rng.rand(50, 2))
        assert model.nbrs_.n_neighbors == n_neighbors
def test_sparse_input():
    """Fitting Isomap on CSR input must not raise for any solver combination."""
    # Seed the sparse generator so that the test data is reproducible.
    X = sparse_rand(100, 3, density=0.1, format="csr", random_state=0)

    # Should not error
    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(
                n_components=2,
                eigen_solver=eigen_solver,
                path_method=path_method,
                n_neighbors=8,
            )
            clf.fit(X)
def test_isomap_fit_precomputed_radius_graph():
    """Isomap.fit_transform must yield similar result when using
    a precomputed distance matrix."""
    radius = 10
    X, _ = datasets.make_s_curve(200, random_state=0)

    # embedding obtained from a precomputed radius-neighbors distance graph
    graph = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
    precomputed_model = manifold.Isomap(
        n_neighbors=None, radius=radius, metric="precomputed"
    )
    precomputed_model.fit(graph)
    precomputed_result = precomputed_model.embedding_

    # embedding obtained directly from the raw data
    raw_model = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
    result = raw_model.fit_transform(X)

    assert_allclose(precomputed_result, result)
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
    """Isomap.fit_transform must raise a ValueError if
    radius and n_neighbors are provided."""
    expected_message = "Both n_neighbors and radius are provided"
    X, _ = datasets.load_digits(return_X_y=True)
    estimator = manifold.Isomap(n_neighbors=3, radius=5.5)

    with pytest.raises(ValueError, match=expected_message):
        estimator.fit_transform(X)
def test_multiple_connected_components():
    """A warning is raised when the graph has multiple components."""
    # two well separated groups of points on a line
    points = np.array([0, 1, 2, 5, 6, 7])
    X = points[:, None]

    estimator = manifold.Isomap(n_neighbors=2)
    with pytest.warns(UserWarning, match="number of connected components"):
        estimator.fit(X)
def test_multiple_connected_components_metric_precomputed():
    """With ``metric="precomputed"`` and several graph components, a dense
    distance matrix only warns while a sparse neighbors graph raises."""
    message = "number of connected components"
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]

    # works with a precomputed distance matrix (dense)
    dense_distances = pairwise_distances(X)
    with pytest.warns(UserWarning, match=message):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(dense_distances)

    # does not work with a precomputed neighbors graph (sparse)
    sparse_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
    with pytest.raises(RuntimeError, match=message):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(sparse_graph)
def test_get_feature_names_out():
    """Check get_feature_names_out for Isomap."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)

    iso = manifold.Isomap(n_components=n_components)
    iso.fit_transform(X)

    # output features are named after the lowercased class name
    expected_names = [f"isomap{i}" for i in range(n_components)]
    assert_array_equal(expected_names, iso.get_feature_names_out())

View File

@@ -0,0 +1,186 @@
from itertools import product
import numpy as np
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
)
from scipy import linalg
import pytest
from sklearn import neighbors, manifold
from sklearn.datasets import make_blobs
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import ignore_warnings
# ``eigen_solver`` values exercised by the LocallyLinearEmbedding tests below.
eigen_solvers = ["dense", "arpack"]
# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph(global_dtype):
    """Check barycenter weights for one and two neighbors on three points."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)

    # With a single neighbor, each point puts all its weight on that neighbor.
    one_neighbor = barycenter_kneighbors_graph(X, 1)
    expected_graph = np.array(
        [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
    )
    assert one_neighbor.dtype == global_dtype
    assert_allclose(one_neighbor.toarray(), expected_graph)

    two_neighbors = barycenter_kneighbors_graph(X, 2)
    weights = two_neighbors.toarray()
    # check that the weights of each row sum to one
    assert_allclose(np.sum(weights, axis=1), np.ones(3))
    # the weighted neighbors should roughly reconstruct the points
    pred = np.dot(two_neighbors.toarray(), X)
    assert linalg.norm(pred - X) / X.shape[0] < 1
# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.
def test_lle_simple_grid(global_dtype):
    """LLE keeps the reconstruction error small on a regular 2D grid."""
    # note: ARPACK is numerically unstable, so this test will fail for
    #       some random seeds.  We choose 42 because the tests pass.
    #       for arm64 platforms 2 makes the test fail.
    # TODO: rewrite this test to make less sensitive to the random seed,
    # irrespective of the platform.
    rng = np.random.RandomState(42)

    # grid of equidistant points in 2D, n_components = n_dim
    grid = np.array(list(product(range(5), repeat=2)))
    jitter = 1e-10 * rng.uniform(size=grid.shape)
    X = (grid + jitter).astype(global_dtype, copy=False)

    n_components = 2
    tol = 0.1
    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=5, n_components=n_components, random_state=rng
    )

    # the barycenter weights must reconstruct the input itself
    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    input_error = linalg.norm(np.dot(N, X) - X, "fro")
    assert input_error < tol

    for solver in eigen_solvers:
        clf.set_params(eigen_solver=solver)
        clf.fit(X)
        assert clf.embedding_.shape[1] == n_components

        residual = np.dot(N, clf.embedding_) - clf.embedding_
        embedding_error = linalg.norm(residual, "fro") ** 2
        assert embedding_error < tol
        assert_allclose(clf.reconstruction_error_, embedding_error, atol=1e-1)

    # re-embed a noisy version of X using the transform method
    noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
    X_reembedded = clf.transform(X + noise)
    assert linalg.norm(X_reembedded - clf.embedding_) < tol
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
    """Reconstruction error stays bounded on a curved 2D manifold in 3D."""
    rng = np.random.RandomState(0)

    # similar test on a slightly more complex manifold
    grid = np.array(list(product(np.arange(18), repeat=2)))
    curved = np.c_[grid, grid[:, 0] ** 2 / 18]
    X = curved + 1e-10 * rng.uniform(size=curved.shape)
    X = X.astype(global_dtype, copy=False)

    n_components = 2
    tol = 1.5 if method == "standard" else 3
    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=6, n_components=n_components, method=method, random_state=0
    )

    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    input_error = linalg.norm(np.dot(N, X) - X)
    assert input_error < tol

    clf.set_params(eigen_solver=solver)
    clf.fit(X)
    assert clf.embedding_.shape[1] == n_components

    residual = np.dot(N, clf.embedding_) - clf.embedding_
    reconstruction_error = linalg.norm(residual, "fro") ** 2

    # message shown when one of the assertions below fails
    details = "solver: %s, method: %s" % (solver, method)
    assert reconstruction_error < tol, details
    gap = np.abs(clf.reconstruction_error_ - reconstruction_error)
    assert gap < tol * reconstruction_error, details
# Test the error raised when parameter passed to lle is invalid
def test_lle_init_parameters():
    """Invalid ``eigen_solver`` or ``method`` values raise a ValueError on fit."""
    X = np.random.rand(5, 3)

    invalid_cases = [
        ({"eigen_solver": "error"}, "unrecognized eigen_solver 'error'"),
        ({"method": "error"}, "unrecognized method 'error'"),
    ]
    for params, msg in invalid_cases:
        clf = manifold.LocallyLinearEmbedding(**params)
        with pytest.raises(ValueError, match=msg):
            clf.fit(X)
def test_pipeline():
    """LocallyLinearEmbedding can be used as a step inside a Pipeline."""
    # Only a coarse accuracy threshold is checked.
    # TODO check that it actually does something useful
    from sklearn import pipeline, datasets

    X, y = datasets.make_blobs(random_state=0)

    steps = [
        ("filter", manifold.LocallyLinearEmbedding(random_state=0)),
        ("clf", neighbors.KNeighborsClassifier()),
    ]
    clf = pipeline.Pipeline(steps)
    clf.fit(X, y)

    score = clf.score(X, y)
    assert 0.9 < score
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
    """A singular weight matrix makes ``locally_linear_embedding`` raise."""
    M = np.ones((10, 3))
    # Wrap the function itself: warnings are only silenced while the wrapped
    # callable runs, so wrapping the *result* of the call has no effect.
    silent_lle = ignore_warnings(manifold.locally_linear_embedding)
    with pytest.raises(ValueError):
        silent_lle(
            M,
            n_neighbors=2,
            n_components=1,
            method="standard",
            eigen_solver="arpack",
        )
# regression test for #6033
def test_integer_input():
    """Every LLE method accepts integer-typed input."""
    rng = np.random.RandomState(0)
    X = rng.randint(0, 100, size=(20, 3))

    methods = ["standard", "hessian", "modified", "ltsa"]
    for method in methods:
        estimator = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
        estimator.fit(X)  # this previously raised a TypeError
def test_get_feature_names_out():
    """Check get_feature_names_out for LocallyLinearEmbedding."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)

    lle = manifold.LocallyLinearEmbedding(n_components=n_components)
    lle.fit(X)

    # output features are named after the lowercased class name
    expected_names = [f"locallylinearembedding{i}" for i in range(n_components)]
    assert_array_equal(expected_names, lle.get_feature_names_out())

View File

@@ -0,0 +1,44 @@
import numpy as np
from numpy.testing import assert_array_almost_equal
import pytest
from sklearn.manifold import _mds as mds
def test_smacof():
    """One metric SMACOF iteration reproduces the textbook result.

    Uses the data of "Modern Multidimensional Scaling", Borg & Groenen,
    p 154.
    """
    dissimilarities = np.array(
        [[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]
    )
    initial_config = np.array(
        [[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]
    )
    expected = np.array(
        [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
    )

    result = mds.smacof(
        dissimilarities, init=initial_config, n_components=2, max_iter=1, n_init=1
    )
    embedding = result[0]

    assert_array_almost_equal(embedding, expected, decimal=3)
def test_smacof_error():
    """``smacof`` rejects malformed dissimilarity matrices and bad ``init``."""
    # Not symmetric similarity matrix:
    asymmetric = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(asymmetric)

    # Not square similarity matrix:
    non_square = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(non_square)

    # init not None and not correct format (3 rows for 4 samples):
    valid = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    bad_init = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
    with pytest.raises(ValueError):
        mds.smacof(valid, init=bad_init, n_init=1)
def test_MDS():
    """Smoke test: non-metric MDS fits a precomputed dissimilarity matrix."""
    dissimilarities = np.array(
        [[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]
    )
    estimator = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
    estimator.fit(dissimilarities)

View File

@@ -0,0 +1,482 @@
import pytest
import numpy as np
from scipy import sparse
from scipy.sparse import csgraph
from scipy.linalg import eigh
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold._spectral_embedding import _graph_is_connected
from sklearn.manifold._spectral_embedding import _graph_connected_component
from sklearn.manifold import spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import normalized_mutual_info_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
# pyamg is optional: record whether it can be imported so that the tests
# relying on it can be skipped rather than fail.
try:
    from pyamg import smoothed_aggregation_solver  # noqa

    pyamg_available = True
except ImportError:
    pyamg_available = False
# Marker that skips a test (or a single parametrized case) when pyamg is
# not installed.
skip_if_no_pyamg = pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# Non-centered, sparse cluster centers used to generate the blobs dataset
# (``S``, ``true_labels``) shared by the tests below.
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
# one row per cluster, one column per feature
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
    """Check array A and B are equal with possible sign flipping on
    each columns.

    Parameters
    ----------
    A, B : ndarray of shape (n_rows, n_columns)
        Arrays to compare column by column.

    tol : float, default=0.0
        Maximum absolute element-wise difference tolerated within a column,
        either between ``A_col`` and ``B_col`` or between ``A_col`` and
        ``-B_col``.

    Raises
    ------
    AssertionError
        If the shapes differ, or if some column of `A` matches neither the
        corresponding column of `B` nor its negation within `tol`.
    """
    # ``zip`` below stops at the shorter input, so a column-count mismatch
    # would otherwise go unnoticed.
    assert A.shape == B.shape, f"shape mismatch: {A.shape} != {B.shape}"
    tol_squared = tol**2
    for A_col, B_col in zip(A.T, B.T):
        assert (
            np.max((A_col - B_col) ** 2) <= tol_squared
            or np.max((A_col + B_col) ** 2) <= tol_squared
        )
def test_sparse_graph_connected_component():
    """``_graph_connected_component`` recovers each group of a sparse graph
    built from four disjoint groups of nodes."""
    rng = np.random.RandomState(42)
    n_samples = 300
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    # consecutive (start, stop) slices of ``p`` delimiting each group
    group_bounds = list(zip(boundaries[:-1], boundaries[1:]))

    connections = []
    for start, stop in group_bounds:
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        connections.extend(zip(group[:-1], group[1:]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in group_bounds:
        expected_size = stop - start

        from_first = _graph_connected_component(affinity, p[start])
        assert from_first.sum() == expected_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        from_last = _graph_connected_component(affinity, p[stop - 1])
        assert from_last.sum() == expected_size
        assert_array_equal(from_first, from_last)
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=36):
    """A 1D spectral embedding separates two weakly linked dense blocks."""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    n_total = 2 * n_sample
    affinity = np.zeros(shape=[n_total, n_total])

    # first component
    first_block = np.abs(random_state.randn(n_sample, n_sample)) + 2
    affinity[:n_sample, :n_sample] = first_block
    # second component
    second_block = np.abs(random_state.randn(n_sample, n_sample)) + 2
    affinity[n_sample:, n_sample:] = second_block

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection: a single weak link between the two blocks
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    # zero the diagonal, then symmetrize
    np.fill_diagonal(affinity, 0)
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=n_total)
    true_label[:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))

    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
    score = normalized_mutual_info_score(true_label, label_)
    assert score == pytest.approx(1.0)
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(X, eigen_solver, dtype, seed=36):
    """A precomputed RBF kernel embeds like ``affinity="rbf"``."""
    gamma = 1.0
    # settings common to both estimators; each gets its own RandomState
    shared = dict(n_components=2, eigen_solver=eigen_solver)

    se_precomp = SpectralEmbedding(
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        **shared,
    )
    se_rbf = SpectralEmbedding(
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        **shared,
    )

    kernel = rbf_kernel(X.astype(dtype), gamma=gamma)
    embed_precomp = se_precomp.fit_transform(kernel)
    embed_rbf = se_rbf.fit_transform(X.astype(dtype))

    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
    """A precomputed graph holding extra neighbors gives the same embedding
    as one holding exactly ``n_neighbors`` neighbors."""
    n_neighbors = 2

    def embed_from_graph(additional_neighbors):
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
        graph = nn.kneighbors_graph(S, mode="connectivity")
        model = SpectralEmbedding(
            random_state=0,
            n_components=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        return model.fit(graph).embedding_

    results = [embed_from_graph(extra) for extra in [0, 10]]

    assert_array_equal(results[0], results[1])
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
def test_spectral_embedding_callable_affinity(X, seed=36):
    """A callable RBF affinity embeds like ``affinity="rbf"``."""
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)

    def rbf_affinity(x):
        return rbf_kernel(x, gamma=gamma)

    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=rbf_affinity,
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )

    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)

    # both estimators must build the reference kernel ...
    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    # ... and produce the same embedding up to column signs
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver(dtype, seed=36):
    """The 'amg' and 'arpack' solvers agree up to column signs."""

    def make_estimator(solver):
        return SpectralEmbedding(
            n_components=2,
            affinity="nearest_neighbors",
            eigen_solver=solver,
            n_neighbors=5,
            random_state=np.random.RandomState(seed),
        )

    se_amg = make_estimator("amg")
    se_arpack = make_estimator("arpack")

    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes
    row = [0, 0, 1, 2, 3, 3, 4]
    col = [1, 2, 2, 3, 4, 5, 5]
    val = [100, 100, 100, 1, 100, 100, 100]

    # list every edge in both directions to obtain a symmetric matrix
    symmetric_entries = (val + val, (row + col, col + row))
    affinity = sparse.coo_matrix(symmetric_entries, shape=(6, 6)).toarray()

    for estimator in (se_amg, se_arpack):
        estimator.affinity = "precomputed"

    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# TODO: Remove filterwarnings when pyamg replaces the sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
"ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
"ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
# Non-regression test for amg solver failure (issue #13393 on github)
num_nodes = 100
X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
X = X.astype(dtype)
upper = sparse.triu(X) - sparse.diags(X.diagonal())
sym_matrix = upper + upper.T
embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=0
)
# Check that the learned embedding is stable w.r.t. random solver init:
for i in range(3):
new_embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
)
_assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    """Chain `SpectralEmbedding` and `KMeans` to perform spectral clustering.

    For both the "rbf" and the "nearest_neighbors" affinities, the k-means
    labels obtained on the embedding of `S` must reach a normalized mutual
    information of 1.0 (to 2 decimals) against `true_labels`.
    """
    # A single RandomState instance is shared by every estimator below, so
    # the order in which they are fitted matters.
    rng = np.random.RandomState(seed)
    embedders = (
        SpectralEmbedding(n_components=n_clusters, affinity="rbf", random_state=rng),
        SpectralEmbedding(
            n_components=n_clusters,
            affinity="nearest_neighbors",
            n_neighbors=5,
            random_state=rng,
        ),
    )
    for embedder in embedders:
        kmeans = KMeans(n_clusters=n_clusters, random_state=rng)
        embedded = embedder.fit_transform(S)
        kmeans.fit(embedded)
        nmi = normalized_mutual_info_score(kmeans.labels_, true_labels)
        assert_array_almost_equal(nmi, 1.0, 2)
def test_spectral_embedding_unknown_eigensolver(seed=36):
    """Fitting `SpectralEmbedding` with eigen_solver="<unknown>" must fail.

    NOTE(review): no `match=` is given, so any ValueError raised by `fit`
    satisfies this check.
    """
    rng = np.random.RandomState(seed)
    estimator = SpectralEmbedding(
        eigen_solver="<unknown>",
        affinity="precomputed",
        n_components=1,
        random_state=rng,
    )
    with pytest.raises(ValueError):
        estimator.fit(S)
def test_spectral_embedding_unknown_affinity(seed=36):
    """Fitting `SpectralEmbedding` with affinity="<unknown>" must fail.

    NOTE(review): no `match=` is given, so any ValueError raised by `fit`
    satisfies this check.
    """
    rng = np.random.RandomState(seed)
    estimator = SpectralEmbedding(
        affinity="<unknown>",
        n_components=1,
        random_state=rng,
    )
    with pytest.raises(ValueError):
        estimator.fit(S)
def test_connectivity(seed=36):
    """Check `_graph_is_connected` on dense, CSR and CSC adjacency matrices."""
    # Node 0 only has a self-loop: it is isolated from the chain 1-2-3-4.
    disconnected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    # Same chain with an extra 0-1 edge: all five nodes are linked.
    connected = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    for graph, expected in ((disconnected, False), (connected, True)):
        # The answer must not depend on the storage format of the graph.
        variants = (graph, sparse.csr_matrix(graph), sparse.csc_matrix(graph))
        for variant in variants:
            assert bool(_graph_is_connected(variant)) is expected
def test_spectral_embedding_deterministic():
    """Two `spectral_embedding` calls on the same similarities must agree."""
    rng = np.random.RandomState(36)
    samples = rng.randn(10, 30)
    similarities = rbf_kernel(samples)

    first_run = spectral_embedding(similarities)
    second_run = spectral_embedding(similarities)

    assert_array_almost_equal(first_run, second_run)
def test_spectral_embedding_unnormalized():
    """Check `spectral_embedding` with `norm_laplacian=False`.

    The result is compared against a manual computation: a dense `eigh` of
    the unnormalized graph Laplacian, followed by
    `_deterministic_vector_sign_flip` on the first `n_components`
    eigenvectors.
    """
    rng = np.random.RandomState(36)
    samples = rng.randn(10, 30)
    similarities = rbf_kernel(samples)
    n_components = 8

    computed = spectral_embedding(
        similarities,
        norm_laplacian=False,
        n_components=n_components,
        drop_first=False,
    )

    # Manual reference; the returned diagonal is not needed here.
    laplacian, _ = csgraph.laplacian(similarities, normed=False, return_diag=True)
    _, eigenvectors = eigh(laplacian)
    # Eigenvectors are columns: transpose to select the first n_components.
    leading = eigenvectors.T[:n_components]
    expected = _deterministic_vector_sign_flip(leading).T

    assert_array_almost_equal(computed, expected)
def test_spectral_embedding_first_eigen_vector():
    """Check the first two columns returned by `spectral_embedding`.

    For a connected graph and `drop_first=False`, the first column must be
    constant (zero standard deviation) while the second must not be, for
    each of the ten `random_state` values tried.
    """
    rng = np.random.RandomState(36)
    samples = rng.randn(10, 30)
    similarities = rbf_kernel(samples)
    embedding_params = {
        "norm_laplacian": False,
        "n_components": 2,
        "drop_first": False,
    }
    for solver_seed in range(10):
        embedding = spectral_embedding(
            similarities, random_state=solver_seed, **embedding_params
        )
        first_column = embedding[:, 0]
        second_column = embedding[:, 1]
        assert np.std(first_column) == pytest.approx(0)
        assert np.std(second_column) > 1e-3
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the input dtype.

    The transformed data as well as the fitted `embedding_` and
    `affinity_matrix_` attributes must share the dtype of the input.

    Ideally, this would be covered by the common test
    `check_transformer_preserve_dtypes`. However, that test only runs on
    transformers implementing `transform`, while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    estimator = SpectralEmbedding(
        eigen_solver=eigen_solver,
        affinity="rbf",
        n_components=2,
        random_state=0,
    )
    transformed = estimator.fit_transform(S.astype(dtype))

    assert transformed.dtype == dtype
    assert estimator.embedding_.dtype == dtype
    assert estimator.affinity_matrix_.dtype == dtype
@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    """Requesting the "amg" solver without pyamg must raise a ValueError.

    Only runs when pyamg is not installed (see the `skipif` marker).
    """
    expected_message = (
        "The eigen_solver was set to 'amg', but pyamg is not available."
    )
    estimator = SpectralEmbedding(
        eigen_solver="amg",
        affinity="rbf",
        n_components=2,
    )
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit_transform(S)