first commit
This commit is contained in:
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
The :mod:`sklearn.manifold` module implements data embedding techniques.
|
||||
"""
|
||||
|
||||
from ._locally_linear import locally_linear_embedding, LocallyLinearEmbedding
|
||||
from ._isomap import Isomap
|
||||
from ._mds import MDS, smacof
|
||||
from ._spectral_embedding import SpectralEmbedding, spectral_embedding
|
||||
from ._t_sne import TSNE, trustworthiness
|
||||
|
||||
# Names exported by ``from sklearn.manifold import *``; each one is
# imported from a private submodule at the top of this file.
__all__ = [
    # ._locally_linear
    "locally_linear_embedding",
    "LocallyLinearEmbedding",
    # ._isomap
    "Isomap",
    # ._mds
    "MDS",
    "smacof",
    # ._spectral_embedding
    "SpectralEmbedding",
    "spectral_embedding",
    # ._t_sne
    "TSNE",
    "trustworthiness",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,394 @@
|
||||
"""Isomap for manifold learning"""
|
||||
|
||||
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
|
||||
# License: BSD 3 clause (C) 2011
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse.csgraph import shortest_path
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
|
||||
from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin
|
||||
from ..neighbors import NearestNeighbors, kneighbors_graph
|
||||
from ..neighbors import radius_neighbors_graph
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..decomposition import KernelPCA
|
||||
from ..preprocessing import KernelCenterer
|
||||
from ..utils.graph import _fix_connected_components
|
||||
|
||||
|
||||
class Isomap(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Isomap Embedding.

    Non-linear dimensionality reduction through Isometric Mapping

    Read more in the :ref:`User Guide <isomap>`.

    Parameters
    ----------
    n_neighbors : int or None, default=5
        Number of neighbors to consider for each point. If `n_neighbors` is an int,
        then `radius` must be `None`.

    radius : float or None, default=None
        Limiting distance of neighbors to return. If `radius` is a float,
        then `n_neighbors` must be set to `None`.

        .. versionadded:: 1.1

    n_components : int, default=2
        Number of coordinates for the manifold.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        'auto' : Attempt to choose the most efficient solver
        for the given problem.

        'arpack' : Use Arnoldi decomposition to find the eigenvalues
        and eigenvectors.

        'dense' : Use a direct solver (i.e. LAPACK)
        for the eigenvalue decomposition.

    tol : float, default=0
        Convergence tolerance passed to arpack or lobpcg.
        not used if eigen_solver == 'dense'.

    max_iter : int, default=None
        Maximum number of iterations for the arpack solver.
        not used if eigen_solver == 'dense'.

    path_method : {'auto', 'FW', 'D'}, default='auto'
        Method to use in finding shortest path.

        'auto' : attempt to choose the best algorithm automatically.

        'FW' : Floyd-Warshall algorithm.

        'D' : Dijkstra's algorithm.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
            default='auto'
        Algorithm to use for nearest neighbors search,
        passed to neighbors.NearestNeighbors instance.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    metric : str, or callable, default="minkowski"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph`.

        .. versionadded:: 0.22

    p : int, default=2
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        .. versionadded:: 0.22

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.22

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the embedding vectors.

    kernel_pca_ : object
        :class:`~sklearn.decomposition.KernelPCA` object used to implement the
        embedding.

    nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDtree
        if applicable.

    dist_matrix_ : array-like, shape (n_samples, n_samples)
        Stores the geodesic distance matrix of training data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    MDS : Manifold learning using multidimensional scaling.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------

    .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
           framework for nonlinear dimensionality reduction. Science 290 (5500)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import Isomap
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = Isomap(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        *,
        n_neighbors=5,
        radius=None,
        n_components=2,
        eigen_solver="auto",
        tol=0,
        max_iter=None,
        path_method="auto",
        neighbors_algorithm="auto",
        n_jobs=None,
        metric="minkowski",
        p=2,
        metric_params=None,
    ):
        # Parameters are stored untouched; validation happens at fit time.
        self.n_neighbors = n_neighbors
        self.radius = radius
        self.n_components = n_components
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.path_method = path_method
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs
        self.metric = metric
        self.p = p
        self.metric_params = metric_params

    def _fit_transform(self, X):
        """Fit the neighbors model, geodesic distances and kernel PCA on X.

        Sets ``nbrs_``, ``n_features_in_`` (and ``feature_names_in_`` when
        the neighbors estimator exposes it), ``kernel_pca_``,
        ``dist_matrix_``, ``embedding_`` and ``_n_features_out``.

        Raises
        ------
        ValueError
            If both `n_neighbors` and `radius` are set.
        RuntimeError
            If the neighbors graph has more than one connected component
            and X is a sparse precomputed graph, which cannot be completed.
        """
        if self.n_neighbors is not None and self.radius is not None:
            raise ValueError(
                "Both n_neighbors and radius are provided. Use"
                f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
                " radius-based neighbors"
            )

        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            radius=self.radius,
            algorithm=self.neighbors_algorithm,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        self.nbrs_.fit(X)
        self.n_features_in_ = self.nbrs_.n_features_in_
        if hasattr(self.nbrs_, "feature_names_in_"):
            self.feature_names_in_ = self.nbrs_.feature_names_in_

        self.kernel_pca_ = KernelPCA(
            n_components=self.n_components,
            kernel="precomputed",
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            n_jobs=self.n_jobs,
        )

        # Build the distance-weighted neighbors graph, either k-nearest or
        # radius based depending on which parameter is set.
        if self.n_neighbors is not None:
            nbg = kneighbors_graph(
                self.nbrs_,
                self.n_neighbors,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )
        else:
            nbg = radius_neighbors_graph(
                self.nbrs_,
                radius=self.radius,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )

        # Compute the number of connected components, and connect the different
        # components to be able to compute a shortest path between all pairs
        # of samples in the graph.
        # Similar fix to cluster._agglomerative._fix_connectivity.
        n_connected_components, labels = connected_components(nbg)
        if n_connected_components > 1:
            if self.metric == "precomputed" and issparse(X):
                raise RuntimeError(
                    "The number of connected components of the neighbors graph"
                    f" is {n_connected_components} > 1. The graph cannot be "
                    "completed with metric='precomputed', and Isomap cannot be "
                    "fitted. Increase the number of neighbors to avoid this "
                    "issue, or precompute the full distance matrix instead "
                    "of passing a sparse neighbors graph."
                )
            warnings.warn(
                "The number of connected components of the neighbors graph "
                f"is {n_connected_components} > 1. Completing the graph to fit"
                " Isomap might be slow. Increase the number of neighbors to "
                "avoid this issue.",
                stacklevel=2,
            )

            # use array validated by NearestNeighbors
            nbg = _fix_connected_components(
                X=self.nbrs_._fit_X,
                graph=nbg,
                n_connected_components=n_connected_components,
                component_labels=labels,
                mode="distance",
                metric=self.nbrs_.effective_metric_,
                **self.nbrs_.effective_metric_params_,
            )

        self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)

        # Kernel fed to KernelPCA: -0.5 * D**2 (centering is left to KernelPCA).
        G = self.dist_matrix_**2
        G *= -0.5

        self.embedding_ = self.kernel_pca_.fit_transform(G)
        self._n_features_out = self.embedding_.shape[1]

    def reconstruction_error(self):
        """Compute the reconstruction error for the embedding.

        Returns
        -------
        reconstruction_error : float
            Reconstruction error.

        Notes
        -----
        The cost function of an isomap embedding is

        ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``

        Where D is the matrix of distances for the input data X,
        D_fit is the matrix of distances for the output embedding X_fit,
        and K is the isomap kernel:

        ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
        """
        # Same fitted-state guard as ``transform``: this method reads
        # ``dist_matrix_`` and ``kernel_pca_``, which only exist after fit.
        check_is_fitted(self)
        G = -0.5 * self.dist_matrix_**2
        G_center = KernelCenterer().fit_transform(G)
        evals = self.kernel_pca_.eigenvalues_
        return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]

    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
            Sample data, shape = (n_samples, n_features), in the form of a
            numpy array, sparse graph, precomputed tree, or NearestNeighbors
            object.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        self._fit_transform(X)
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse graph, BallTree, KDTree}
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            X transformed in the new space.
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """Transform X.

        This is implemented by linking the points X into the graph of geodesic
        distances of the training data. First the `n_neighbors` nearest
        neighbors of X are found in the training data, and from these the
        shortest geodesic distances from each point in X to each point in
        the training data are computed in order to construct the kernel.
        The embedding of X is the projection of this kernel onto the
        embedding vectors of the training set.

        Parameters
        ----------
        X : array-like, shape (n_queries, n_features)
            If metric='precomputed', X is assumed to be a
            distance matrix or a sparse graph of shape
            (n_queries, n_samples_fit).

        Returns
        -------
        X_new : array-like, shape (n_queries, n_components)
            X transformed in the new space.
        """
        check_is_fitted(self)
        if self.n_neighbors is not None:
            distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
        else:
            distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)

        # Create the graph of shortest distances from X to
        # training data via the nearest neighbors of X.
        # This can be done as a single array operation, but it potentially
        # takes a lot of memory. To avoid that, use a loop:

        n_samples_fit = self.nbrs_.n_samples_fit_
        n_queries = distances.shape[0]
        G_X = np.zeros((n_queries, n_samples_fit))
        for i in range(n_queries):
            # Shortest route from query i to every training sample that goes
            # through one of the query's neighbors.
            G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)

        G_X **= 2
        G_X *= -0.5

        return self.kernel_pca_.transform(G_X)
|
||||
@@ -0,0 +1,810 @@
|
||||
"""Locally Linear Embedding"""
|
||||
|
||||
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
|
||||
# Jake Vanderplas -- <vanderplas@astro.washington.edu>
|
||||
# License: BSD 3 clause (C) INRIA 2011
|
||||
|
||||
import numpy as np
|
||||
from scipy.linalg import eigh, svd, qr, solve
|
||||
from scipy.sparse import eye, csr_matrix
|
||||
from scipy.sparse.linalg import eigsh
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
TransformerMixin,
|
||||
_UnstableArchMixin,
|
||||
_ClassNamePrefixFeaturesOutMixin,
|
||||
)
|
||||
from ..utils import check_random_state, check_array
|
||||
from ..utils._arpack import _init_arpack_v0
|
||||
from ..utils.extmath import stable_cumsum
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import FLOAT_DTYPES
|
||||
from ..neighbors import NearestNeighbors
|
||||
|
||||
|
||||
def barycenter_weights(X, Y, indices, reg=1e-3):
    """Compute barycenter weights of X from Y along the first axis.

    We estimate the weights to assign to each point in Y[indices] to recover
    the point X[i]. The barycenter weights sum to 1.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_dim)

    Y : array-like, shape (n_samples, n_dim)

    indices : array-like, shape (n_samples, n_neighbors)
        Indices of the points in Y used to compute the barycenter

    reg : float, default=1e-3
        Amount of regularization to add for the problem to be
        well-posed in the case of n_neighbors > n_dim. It is multiplied by
        the trace of each local Gram matrix when that trace is positive,
        and used as-is otherwise.

    Returns
    -------
    B : array-like, shape (n_samples, n_neighbors)

    Notes
    -----
    See developers note for more information.
    """
    X = check_array(X, dtype=FLOAT_DTYPES)
    Y = check_array(Y, dtype=FLOAT_DTYPES)
    indices = check_array(indices, dtype=int)

    n_samples, n_neighbors = indices.shape
    # Internal invariant: one row of neighbor indices per row of X.
    assert X.shape[0] == n_samples

    B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
    v = np.ones(n_neighbors, dtype=X.dtype)

    # this might raise a LinalgError if G is singular and has trace
    # zero
    for i, ind in enumerate(indices):
        A = Y[ind]
        C = A - X[i]  # broadcasting
        G = np.dot(C, C.T)
        trace = np.trace(G)
        if trace > 0:
            R = reg * trace
        else:
            R = reg
        # Add R to the diagonal of G (stride n_neighbors + 1 on the flat view).
        G.flat[:: n_neighbors + 1] += R
        # ``assume_a="pos"`` replaces the ``sym_pos=True`` keyword, which
        # SciPy deprecated and later removed.
        w = solve(G, v, assume_a="pos")
        B[i, :] = w / np.sum(w)
    return B
|
||||
|
||||
|
||||
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
    """Build the sparse graph of barycenter weights over k-nearest neighbors.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        Number of neighbors for each sample.

    reg : float, default=1e-3
        Amount of regularization, forwarded to ``barycenter_weights``.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix in CSR format, shape = [n_samples, n_samples]
        A[i, j] is assigned the weight of edge that connects i to j.

    See Also
    --------
    sklearn.neighbors.kneighbors_graph
    sklearn.neighbors.radius_neighbors_graph
    """
    # Ask for one extra neighbor; the first column of the result is dropped
    # below (presumably each sample itself, since the training data is
    # queried against itself).
    knn_model = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    knn_model = knn_model.fit(X)

    # Work on the array as validated and stored by the neighbors estimator.
    X = knn_model._fit_X
    n_samples = knn_model.n_samples_fit_

    neighbor_idx = knn_model.kneighbors(X, return_distance=False)
    neighbor_idx = neighbor_idx[:, 1:]

    weights = barycenter_weights(X, X, neighbor_idx, reg=reg)

    # Every row holds exactly ``n_neighbors`` entries, so the CSR row
    # pointer is a plain arithmetic progression.
    row_ptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
    graph_shape = (n_samples, n_samples)
    return csr_matrix(
        (weights.ravel(), neighbor_idx.ravel(), row_ptr), shape=graph_shape
    )
|
||||
|
||||
|
||||
def null_space(
    M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
    """
    Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite

    k : int
        Number of eigenvalues/vectors to return

    k_skip : int, default=1
        Number of low eigenvalues to skip.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
        auto : algorithm will attempt to choose the best method for input data
        arpack : use arnoldi iteration in shift-invert mode.
                    For this method, M may be a dense matrix, sparse matrix,
                    or general linear operator.
                    Warning: ARPACK can be unstable for some problems.  It is
                    best to try several random seeds in order to check results.
        dense  : use standard dense matrix operations for the eigenvalue
                    decomposition.  For this method, M must be an array
                    or matrix type.  This method should be avoided for
                    large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for 'arpack' method.
        Not used if eigen_solver=='dense'

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``eigen_solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    eigen_vectors : ndarray of shape (n, k)
        The `k` eigenvectors kept after skipping the `k_skip` lowest ones,
        where n is ``M.shape[0]``.

    error : float
        Sum of the eigenvalues associated with the returned eigenvectors.

    Raises
    ------
    ValueError
        If `eigen_solver` is not recognized, or if ARPACK fails with a
        ``RuntimeError`` (re-raised with a hint to use the dense solver).
    """
    if eigen_solver == "auto":
        # ARPACK only for larger matrices with few requested eigenpairs.
        if M.shape[0] > 200 and k + k_skip < 10:
            eigen_solver = "arpack"
        else:
            eigen_solver = "dense"

    if eigen_solver == "arpack":
        v0 = _init_arpack_v0(M.shape[0], random_state)
        try:
            eigen_values, eigen_vectors = eigsh(
                M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
            )
        except RuntimeError as e:
            raise ValueError(
                "Error in determining null-space with ARPACK. Error message: "
                "'%s'. Note that eigen_solver='arpack' can fail when the "
                "weight matrix is singular or otherwise ill-behaved. In that "
                "case, eigen_solver='dense' is recommended. See online "
                "documentation for more information." % e
            ) from e

        return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
    elif eigen_solver == "dense":
        if hasattr(M, "toarray"):
            M = M.toarray()
        # ``subset_by_index`` replaces the ``eigvals`` keyword, which SciPy
        # deprecated and later removed; both select the same inclusive
        # index range of eigenvalues.
        eigen_values, eigen_vectors = eigh(
            M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
        )
        index = np.argsort(np.abs(eigen_values))
        return eigen_vectors[:, index], np.sum(eigen_values)
    else:
        raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
|
||||
|
||||
|
||||
def locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    """Perform a Locally Linear Embedding analysis on the data.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        number of neighbors to consider for each point.

    n_components : int
        number of coordinates for the manifold.

    reg : float, default=1e-3
        regularization constant, multiplies the trace of the local covariance
        matrix of the distances. Used by the 'standard' and 'modified'
        methods.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        auto : algorithm will attempt to choose the best method for input data

        arpack : use arnoldi iteration in shift-invert mode.
                    For this method, M may be a dense matrix, sparse matrix,
                    or general linear operator.
                    Warning: ARPACK can be unstable for some problems.  It is
                    best to try several random seeds in order to check results.

        dense  : use standard dense matrix operations for the eigenvalue
                    decomposition.  For this method, M must be an array
                    or matrix type.  This method should be avoided for
                    large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        maximum number of iterations for the arpack solver.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        standard : use the standard locally linear embedding algorithm.
                   see reference [1]_
        hessian  : use the Hessian eigenmap method.  This method requires
                   n_neighbors > n_components * (n_components + 3) / 2.
                   see reference [2]_
        modified : use the modified locally linear embedding algorithm.
                   see reference [3]_
        ltsa     : use local tangent space alignment algorithm
                   see reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if method == 'hessian'

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if method == 'modified'

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``eigen_solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    Y : array-like, shape [n_samples, n_components]
        Embedding vectors.

    squared_error : float
        Reconstruction error for the embedding vectors. Equivalent to
        ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.

    Raises
    ------
    ValueError
        If `eigen_solver` or `method` is not recognized, if `n_components`
        exceeds the input dimension, if `n_neighbors` is not in
        ``[1, n_samples)``, or if `n_neighbors` is too small for the
        'hessian' or 'modified' method.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding.  Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A.  100:5591 (2003).
    .. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ.  8:406 (2004)
    """
    if eigen_solver not in ("auto", "arpack", "dense"):
        raise ValueError("unrecognized eigen_solver '%s'" % eigen_solver)

    if method not in ("standard", "hessian", "modified", "ltsa"):
        raise ValueError("unrecognized method '%s'" % method)

    # One extra neighbor is requested because the first neighbor column is
    # discarded below (``[:, 1:]``) when the training data queries itself.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    nbrs.fit(X)
    X = nbrs._fit_X

    N, d_in = X.shape

    if n_components > d_in:
        raise ValueError(
            "output dimension must be less than or equal to input dimension"
        )
    if n_neighbors >= N:
        # The check rejects n_neighbors == n_samples, so the message states
        # the strict inequality.
        raise ValueError(
            "Expected n_neighbors < n_samples,  but n_samples = %d, n_neighbors = %d"
            % (N, n_neighbors)
        )

    if n_neighbors <= 0:
        raise ValueError("n_neighbors must be positive")

    M_sparse = eigen_solver != "dense"

    if method == "standard":
        W = barycenter_kneighbors_graph(
            nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
        )

        # we'll compute M = (I-W)'(I-W)
        # depending on the solver, we'll do this differently
        if M_sparse:
            M = eye(*W.shape, format=W.format) - W
            M = (M.T * M).tocsr()
        else:
            M = (W.T * W - W.T - W).toarray()
            # add the identity on the diagonal: W'W - W' - W + I = (I-W)'(I-W)
            M.flat[:: M.shape[0] + 1] += 1

    elif method == "hessian":
        dp = n_components * (n_components + 1) // 2

        if n_neighbors <= n_components + dp:
            raise ValueError(
                "for method='hessian', n_neighbors must be "
                "greater than "
                "[n_components * (n_components + 3) / 2]"
            )

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
        Yi[:, 0] = 1

        M = np.zeros((N, N), dtype=np.float64)

        use_svd = n_neighbors > d_in

        for i in range(N):
            # fancy indexing copies, so centering does not modify X
            Gi = X[neighbors[i]]
            Gi -= Gi.mean(0)

            # build Hessian estimator
            if use_svd:
                U = svd(Gi, full_matrices=0)[0]
            else:
                Ci = np.dot(Gi, Gi.T)
                U = eigh(Ci)[1][:, ::-1]

            Yi[:, 1 : 1 + n_components] = U[:, :n_components]

            # fill the remaining dp columns with the pairwise products
            # U[:, k] * U[:, l] for k <= l < n_components
            j = 1 + n_components
            for k in range(n_components):
                Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
                j += n_components - k

            Q, _ = qr(Yi)

            w = Q[:, n_components + 1 :]
            S = w.sum(0)

            # avoid dividing by (near-)zero column sums
            S[np.where(abs(S) < hessian_tol)] = 1
            w /= S

            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(w, w.T)

        if M_sparse:
            M = csr_matrix(M)

    elif method == "modified":
        if n_neighbors < n_components:
            raise ValueError("modified LLE requires n_neighbors >= n_components")

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        # find the eigenvectors and eigenvalues of each local covariance
        # matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
        # where the columns are eigenvectors
        V = np.zeros((N, n_neighbors, n_neighbors))
        nev = min(d_in, n_neighbors)
        evals = np.zeros([N, nev])

        # choose the most efficient way to find the eigenvectors
        use_svd = n_neighbors > d_in

        if use_svd:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
            evals **= 2
        else:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                C_nbrs = np.dot(X_nbrs, X_nbrs.T)
                evi, vi = eigh(C_nbrs)
                evals[i] = evi[::-1]
                V[i] = vi[:, ::-1]

        # find regularized weights: this is like normal LLE.
        # because we've already computed the SVD of each covariance matrix,
        # it's faster to use this rather than np.linalg.solve
        # The user-supplied ``reg`` scales the trace of each local
        # covariance (sum of its eigenvalues); with the default reg=1e-3
        # this equals the previously hard-coded constant.
        local_reg = reg * evals.sum(1)

        tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
        tmp[:, :nev] /= evals + local_reg[:, None]
        tmp[:, nev:] /= local_reg[:, None]

        w_reg = np.zeros((N, n_neighbors))
        for i in range(N):
            w_reg[i] = np.dot(V[i], tmp[i])
        w_reg /= w_reg.sum(1)[:, None]

        # calculate eta: the median of the ratio of small to large eigenvalues
        # across the points.  This is used to determine s_i, below
        rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
        eta = np.median(rho)

        # find s_i, the size of the "almost null space" for each point:
        # this is the size of the largest set of eigenvalues
        # such that Sum[v; v in set]/Sum[v; v not in set] < eta
        s_range = np.zeros(N, dtype=int)
        evals_cumsum = stable_cumsum(evals, 1)
        eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
        for i in range(N):
            s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
        s_range += n_neighbors - nev  # number of zero eigenvalues

        # Now calculate M.
        # This is the [N x N] matrix whose null space is the desired embedding
        M = np.zeros((N, N), dtype=np.float64)
        for i in range(N):
            s_i = s_range[i]

            # select bottom s_i eigenvectors and calculate alpha
            Vi = V[i, :, n_neighbors - s_i :]
            alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)

            # compute Householder matrix which satisfies
            #  Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
            # using prescription from paper
            h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))

            norm_h = np.linalg.norm(h)
            if norm_h < modified_tol:
                h *= 0
            else:
                h /= norm_h

            # Householder matrix is
            #  >> Hi = np.identity(s_i) - 2*np.outer(h,h)
            # Then the weight matrix is
            #  >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
            # We do this much more efficiently:
            Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]

            # Update M as follows:
            # >> W_hat = np.zeros( (N,s_i) )
            # >> W_hat[neighbors[i],:] = Wi
            # >> W_hat[i] -= 1
            # >> M += np.dot(W_hat,W_hat.T)
            # We can do this much more efficiently:
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
            Wi_sum1 = Wi.sum(1)
            M[i, neighbors[i]] -= Wi_sum1
            M[neighbors[i], i] -= Wi_sum1
            M[i, i] += s_i

        if M_sparse:
            M = csr_matrix(M)

    elif method == "ltsa":
        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        neighbors = neighbors[:, 1:]

        M = np.zeros((N, N))

        use_svd = n_neighbors > d_in

        for i in range(N):
            # fancy indexing copies, so centering does not modify X
            Xi = X[neighbors[i]]
            Xi -= Xi.mean(0)

            # compute n_components largest eigenvalues of Xi * Xi^T
            if use_svd:
                v = svd(Xi, full_matrices=True)[0]
            else:
                Ci = np.dot(Xi, Xi.T)
                v = eigh(Ci)[1][:, ::-1]

            Gi = np.zeros((n_neighbors, n_components + 1))
            Gi[:, 1:] = v[:, :n_components]
            Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)

            GiGiT = np.dot(Gi, Gi.T)

            # accumulate (I - Gi Gi^T) on the rows/columns of this
            # neighborhood
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] -= GiGiT
            M[neighbors[i], neighbors[i]] += 1

    return null_space(
        M,
        n_components,
        k_skip=1,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
    )
|
||||
|
||||
|
||||
class LocallyLinearEmbedding(
    _ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _UnstableArchMixin,
    BaseEstimator,
):
    """Locally Linear Embedding.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to consider for each point.

    n_components : int, default=2
        Number of coordinates for the manifold.

    reg : float, default=1e-3
        Regularization constant, multiplies the trace of the local covariance
        matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        The solver used to compute the eigenvectors. The available options are:

        - `'auto'` : algorithm will attempt to choose the best method for input
          data.
        - `'arpack'` : use arnoldi iteration in shift-invert mode. For this
          method, M may be a dense matrix, sparse matrix, or general linear
          operator.
        - `'dense'` : use standard dense matrix operations for the eigenvalue
          decomposition. For this method, M must be an array or matrix type.
          This method should be avoided for large problems.

        .. warning::
           ARPACK can be unstable for some problems. It is best to try several
           random seeds in order to check results.

    tol : float, default=1e-6
        Tolerance for 'arpack' method
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for the arpack solver.
        Not used if eigen_solver=='dense'.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        - `standard`: use the standard locally linear embedding algorithm. see
          reference [1]_
        - `hessian`: use the Hessian eigenmap method. This method requires
          ``n_neighbors > n_components * (1 + (n_components + 1) / 2)``. see
          reference [2]_
        - `modified`: use the modified locally linear embedding algorithm.
          see reference [3]_
        - `ltsa`: use local tangent space alignment algorithm. see
          reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if ``method == 'hessian'``.

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if ``method == 'modified'``.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
                          default='auto'
        Algorithm to use for nearest neighbors search, passed to
        :class:`~sklearn.neighbors.NearestNeighbors` instance.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when
        ``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : array-like, shape [n_samples, n_components]
        Stores the embedding vectors

    reconstruction_error_ : float
        Reconstruction error associated with `embedding_`

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    nbrs_ : NearestNeighbors object
        Stores nearest neighbors instance, including BallTree or KDtree
        if applicable.

    See Also
    --------
    SpectralEmbedding : Spectral embedding for non-linear dimensionality
        reduction.
    TSNE : Distributed Stochastic Neighbor Embedding.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding. Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A. 100:5591 (2003).
    .. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ. 8:406 (2004)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import LocallyLinearEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = LocallyLinearEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        *,
        n_neighbors=5,
        n_components=2,
        reg=1e-3,
        eigen_solver="auto",
        tol=1e-6,
        max_iter=100,
        method="standard",
        hessian_tol=1e-4,
        modified_tol=1e-12,
        neighbors_algorithm="auto",
        random_state=None,
        n_jobs=None,
    ):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.reg = reg
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.method = method
        self.hessian_tol = hessian_tol
        self.modified_tol = modified_tol
        self.random_state = random_state
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs

    def _fit_transform(self, X):
        """Fit the neighbors index on X and compute the embedding.

        Sets ``nbrs_``, ``embedding_``, ``reconstruction_error_`` and
        ``_n_features_out`` on the estimator; returns nothing.
        """
        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            algorithm=self.neighbors_algorithm,
            n_jobs=self.n_jobs,
        )

        random_state = check_random_state(self.random_state)
        X = self._validate_data(X, dtype=float)
        self.nbrs_.fit(X)
        # The fitted NearestNeighbors instance is handed over in place of the
        # raw data.
        self.embedding_, self.reconstruction_error_ = locally_linear_embedding(
            X=self.nbrs_,
            n_neighbors=self.n_neighbors,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            method=self.method,
            hessian_tol=self.hessian_tol,
            modified_tol=self.modified_tol,
            random_state=random_state,
            reg=self.reg,
            n_jobs=self.n_jobs,
        )
        self._n_features_out = self.embedding_.shape[1]

    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training set.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted `LocallyLinearEmbedding` class instance.
        """
        self._fit_transform(X)
        return self

    def fit_transform(self, X, y=None):
        """Compute the embedding vectors for data X and transform X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training set.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            Embedding of the training set (the fitted ``embedding_``).
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """
        Transform new points into embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Points to project into the embedding space.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            Coordinates of the points of `X` in the embedding space.

        Notes
        -----
        Because of scaling performed by this method, it is discouraged to use
        it together with methods that are not scale-invariant (like SVMs).
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False)
        # Indices of the nearest training samples of every query point.
        ind = self.nbrs_.kneighbors(
            X, n_neighbors=self.n_neighbors, return_distance=False
        )
        weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)
        X_new = np.empty((X.shape[0], self.n_components))
        for i in range(X.shape[0]):
            # Combine the embeddings of the neighbors with the weights
            # computed for this query point.
            X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
        return X_new
|
||||
@@ -0,0 +1,537 @@
|
||||
"""
|
||||
Multi-dimensional Scaling (MDS).
|
||||
"""
|
||||
|
||||
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
||||
# License: BSD
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, effective_n_jobs
|
||||
|
||||
import warnings
|
||||
|
||||
from ..base import BaseEstimator
|
||||
from ..metrics import euclidean_distances
|
||||
from ..utils import check_random_state, check_array, check_symmetric
|
||||
from ..isotonic import IsotonicRegression
|
||||
from ..utils.fixes import delayed
|
||||
|
||||
|
||||
def _smacof_single(
|
||||
dissimilarities,
|
||||
metric=True,
|
||||
n_components=2,
|
||||
init=None,
|
||||
max_iter=300,
|
||||
verbose=0,
|
||||
eps=1e-3,
|
||||
random_state=None,
|
||||
):
|
||||
"""Computes multidimensional scaling using SMACOF algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dissimilarities : ndarray of shape (n_samples, n_samples)
|
||||
Pairwise dissimilarities between the points. Must be symmetric.
|
||||
|
||||
metric : bool, default=True
|
||||
Compute metric or nonmetric SMACOF algorithm.
|
||||
|
||||
n_components : int, default=2
|
||||
Number of dimensions in which to immerse the dissimilarities. If an
|
||||
``init`` array is provided, this option is overridden and the shape of
|
||||
``init`` is used to determine the dimensionality of the embedding
|
||||
space.
|
||||
|
||||
init : ndarray of shape (n_samples, n_components), default=None
|
||||
Starting configuration of the embedding to initialize the algorithm. By
|
||||
default, the algorithm is initialized with a randomly chosen array.
|
||||
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations of the SMACOF algorithm for a single run.
|
||||
|
||||
verbose : int, default=0
|
||||
Level of verbosity.
|
||||
|
||||
eps : float, default=1e-3
|
||||
Relative tolerance with respect to stress at which to declare
|
||||
convergence.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines the random number generator used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : ndarray of shape (n_samples, n_components)
|
||||
Coordinates of the points in a ``n_components``-space.
|
||||
|
||||
stress : float
|
||||
The final value of the stress (sum of squared distance of the
|
||||
disparities and the distances for all constrained points).
|
||||
|
||||
n_iter : int
|
||||
The number of iterations corresponding to the best stress.
|
||||
"""
|
||||
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
|
||||
|
||||
n_samples = dissimilarities.shape[0]
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
|
||||
sim_flat_w = sim_flat[sim_flat != 0]
|
||||
if init is None:
|
||||
# Randomly choose initial configuration
|
||||
X = random_state.uniform(size=n_samples * n_components)
|
||||
X = X.reshape((n_samples, n_components))
|
||||
else:
|
||||
# overrides the parameter p
|
||||
n_components = init.shape[1]
|
||||
if n_samples != init.shape[0]:
|
||||
raise ValueError(
|
||||
"init matrix should be of shape (%d, %d)" % (n_samples, n_components)
|
||||
)
|
||||
X = init
|
||||
|
||||
old_stress = None
|
||||
ir = IsotonicRegression()
|
||||
for it in range(max_iter):
|
||||
# Compute distance and monotonic regression
|
||||
dis = euclidean_distances(X)
|
||||
|
||||
if metric:
|
||||
disparities = dissimilarities
|
||||
else:
|
||||
dis_flat = dis.ravel()
|
||||
# dissimilarities with 0 are considered as missing values
|
||||
dis_flat_w = dis_flat[sim_flat != 0]
|
||||
|
||||
# Compute the disparities using a monotonic regression
|
||||
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
|
||||
disparities = dis_flat.copy()
|
||||
disparities[sim_flat != 0] = disparities_flat
|
||||
disparities = disparities.reshape((n_samples, n_samples))
|
||||
disparities *= np.sqrt(
|
||||
(n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
|
||||
)
|
||||
|
||||
# Compute stress
|
||||
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
|
||||
|
||||
# Update X using the Guttman transform
|
||||
dis[dis == 0] = 1e-5
|
||||
ratio = disparities / dis
|
||||
B = -ratio
|
||||
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
|
||||
X = 1.0 / n_samples * np.dot(B, X)
|
||||
|
||||
dis = np.sqrt((X**2).sum(axis=1)).sum()
|
||||
if verbose >= 2:
|
||||
print("it: %d, stress %s" % (it, stress))
|
||||
if old_stress is not None:
|
||||
if (old_stress - stress / dis) < eps:
|
||||
if verbose:
|
||||
print("breaking at iteration %d with stress %s" % (it, stress))
|
||||
break
|
||||
old_stress = stress / dis
|
||||
|
||||
return X, stress, it + 1
|
||||
|
||||
|
||||
def smacof(
    dissimilarities,
    *,
    metric=True,
    n_components=2,
    init=None,
    n_init=8,
    n_jobs=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    return_n_iter=False,
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    SMACOF (Scaling by MAjorizing a COmplicated Function) minimizes an
    objective function, the *stress*, with a majorization technique. Stress
    majorization, also known as the Guttman Transform, guarantees a monotone
    convergence of stress, and is more powerful than traditional techniques
    such as gradient descent.

    For metric MDS the algorithm proceeds as follows:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    In the nonmetric case a monotonic regression step is inserted before the
    stress is computed.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    n_init : int, default=8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden and a single run is performed.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
    """

    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    if hasattr(init, "__array__"):
        # Work on a private copy of the caller's starting configuration; an
        # explicit start makes repeated runs pointless, so only one is done.
        init = np.asarray(init).copy()
        if n_init != 1:
            warnings.warn(
                "Explicit initial positions passed: "
                "performing only one init of the MDS instead of %d" % n_init
            )
            n_init = 1

    # Arguments that are the same for every run, serial or parallel.
    run_kwargs = dict(
        metric=metric,
        n_components=n_components,
        init=init,
        max_iter=max_iter,
        verbose=verbose,
        eps=eps,
    )

    best_pos, best_stress = None, None

    if effective_n_jobs(n_jobs) == 1:
        # Serial runs draw from the one shared random state.
        for _ in range(n_init):
            run_pos, run_stress, run_n_iter = _smacof_single(
                dissimilarities, random_state=random_state, **run_kwargs
            )
            if best_stress is None or run_stress < best_stress:
                best_stress = run_stress
                best_pos = run_pos.copy()
                best_iter = run_n_iter
    else:
        # Parallel runs each receive their own integer seed.
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        runner = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))
        results = runner(
            delayed(_smacof_single)(
                dissimilarities, random_state=seed, **run_kwargs
            )
            for seed in seeds
        )
        positions, stresses, n_iters = zip(*results)
        winner = np.argmin(stresses)
        best_stress = stresses[winner]
        best_pos = positions[winner]
        best_iter = n_iters[winner]

    if return_n_iter:
        return best_pos, best_stress, best_iter
    return best_pos, best_stress
|
||||
|
||||
|
||||
class MDS(BaseEstimator):
    """Multidimensional scaling.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities.

    metric : bool, default=True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.

    n_init : int, default=4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to draw the initial
        configuration of the embedding.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
        Dissimilarity measure to use:

        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
            Pre-computed dissimilarities are passed directly to ``fit`` and
            ``fit_transform``.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Symmetric matrix that:

        - either uses a custom dissimilarity matrix by setting `dissimilarity`
          to 'precomputed';
        - or constructs a dissimilarity matrix from data using
          Euclidean distances.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    Isomap : Manifold learning based on Isometric Mapping.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality
        reduction.

    References
    ----------
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        n_components=2,
        *,
        metric=True,
        n_init=4,
        max_iter=300,
        verbose=0,
        eps=1e-3,
        n_jobs=None,
        random_state=None,
        dissimilarity="euclidean",
    ):
        self.n_components = n_components
        self.dissimilarity = dissimilarity
        self.metric = metric
        self.n_init = n_init
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state

    def _more_tags(self):
        # The estimator is tagged as pairwise only when it is fed a
        # precomputed dissimilarity matrix.
        return {"pairwise": self.dissimilarity == "precomputed"}

    def fit(self, X, y=None, init=None):
        """
        Compute the position of the points in the embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self.fit_transform(X, init=init)
        return self

    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from `X`, and returns the embedded coordinates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            X transformed in the new space.
        """
        X = self._validate_data(X)
        # A square input that is not declared as precomputed may be a
        # dissimilarity matrix passed by mistake, so warn about it.
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn(
                "The MDS API has changed. ``fit`` now constructs an"
                " dissimilarity matrix from data. To use a custom "
                "dissimilarity matrix, set "
                "``dissimilarity='precomputed'``."
            )

        if self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        else:
            raise ValueError(
                "Proximity must be 'precomputed' or 'euclidean'. Got %s instead"
                % str(self.dissimilarity)
            )

        self.embedding_, self.stress_, self.n_iter_ = smacof(
            self.dissimilarity_matrix_,
            metric=self.metric,
            n_components=self.n_components,
            init=init,
            n_init=self.n_init,
            n_jobs=self.n_jobs,
            max_iter=self.max_iter,
            verbose=self.verbose,
            eps=self.eps,
            random_state=self.random_state,
            return_n_iter=True,
        )

        return self.embedding_
|
||||
@@ -0,0 +1,671 @@
|
||||
"""Spectral Embedding."""
|
||||
|
||||
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Wei LI <kuantkid@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.linalg import eigh
|
||||
from scipy.sparse.linalg import eigsh
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
|
||||
|
||||
from ..base import BaseEstimator
|
||||
from ..utils import (
|
||||
check_array,
|
||||
check_random_state,
|
||||
check_symmetric,
|
||||
)
|
||||
from ..utils._arpack import _init_arpack_v0
|
||||
from ..utils.extmath import _deterministic_vector_sign_flip
|
||||
from ..utils.fixes import lobpcg
|
||||
from ..metrics.pairwise import rbf_kernel
|
||||
from ..neighbors import kneighbors_graph, NearestNeighbors
|
||||
|
||||
|
||||
def _graph_connected_component(graph, node_id):
|
||||
"""Find the largest graph connected components that contains one
|
||||
given node.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
graph : array-like of shape (n_samples, n_samples)
|
||||
Adjacency matrix of the graph, non-zero weight means an edge
|
||||
between the nodes.
|
||||
|
||||
node_id : int
|
||||
The index of the query node of the graph.
|
||||
|
||||
Returns
|
||||
-------
|
||||
connected_components_matrix : array-like of shape (n_samples,)
|
||||
An array of bool value indicating the indexes of the nodes
|
||||
belonging to the largest connected components of the given query
|
||||
node.
|
||||
"""
|
||||
n_node = graph.shape[0]
|
||||
if sparse.issparse(graph):
|
||||
# speed up row-wise access to boolean connection mask
|
||||
graph = graph.tocsr()
|
||||
connected_nodes = np.zeros(n_node, dtype=bool)
|
||||
nodes_to_explore = np.zeros(n_node, dtype=bool)
|
||||
nodes_to_explore[node_id] = True
|
||||
for _ in range(n_node):
|
||||
last_num_component = connected_nodes.sum()
|
||||
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
|
||||
if last_num_component >= connected_nodes.sum():
|
||||
break
|
||||
indices = np.where(nodes_to_explore)[0]
|
||||
nodes_to_explore.fill(False)
|
||||
for i in indices:
|
||||
if sparse.issparse(graph):
|
||||
neighbors = graph[i].toarray().ravel()
|
||||
else:
|
||||
neighbors = graph[i]
|
||||
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
|
||||
return connected_nodes
|
||||
|
||||
|
||||
def _graph_is_connected(graph):
|
||||
"""Return whether the graph is connected (True) or Not (False).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
|
||||
Adjacency matrix of the graph, non-zero weight means an edge
|
||||
between the nodes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_connected : bool
|
||||
True means the graph is fully connected and False means not.
|
||||
"""
|
||||
if sparse.isspmatrix(graph):
|
||||
# sparse graph, find all the connected components
|
||||
n_connected_components, _ = connected_components(graph)
|
||||
return n_connected_components == 1
|
||||
else:
|
||||
# dense graph, find all connected components start from node 0
|
||||
return _graph_connected_component(graph, 0).sum() == graph.shape[0]
|
||||
|
||||
|
||||
def _set_diag(laplacian, value, norm_laplacian):
|
||||
"""Set the diagonal of the laplacian matrix and convert it to a
|
||||
sparse format well suited for eigenvalue decomposition.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
laplacian : {ndarray, sparse matrix}
|
||||
The graph laplacian.
|
||||
|
||||
value : float
|
||||
The value of the diagonal.
|
||||
|
||||
norm_laplacian : bool
|
||||
Whether the value of the diagonal should be changed or not.
|
||||
|
||||
Returns
|
||||
-------
|
||||
laplacian : {array, sparse matrix}
|
||||
An array of matrix in a form that is well suited to fast
|
||||
eigenvalue decomposition, depending on the band width of the
|
||||
matrix.
|
||||
"""
|
||||
n_nodes = laplacian.shape[0]
|
||||
# We need all entries in the diagonal to values
|
||||
if not sparse.isspmatrix(laplacian):
|
||||
if norm_laplacian:
|
||||
laplacian.flat[:: n_nodes + 1] = value
|
||||
else:
|
||||
laplacian = laplacian.tocoo()
|
||||
if norm_laplacian:
|
||||
diag_idx = laplacian.row == laplacian.col
|
||||
laplacian.data[diag_idx] = value
|
||||
# If the matrix has a small number of diagonals (as in the
|
||||
# case of structured matrices coming from images), the
|
||||
# dia format might be best suited for matvec products:
|
||||
n_diags = np.unique(laplacian.row - laplacian.col).size
|
||||
if n_diags <= 7:
|
||||
# 3 or less outer diagonals on each side
|
||||
laplacian = laplacian.todia()
|
||||
else:
|
||||
# csr has the fastest matvec and is thus best suited to
|
||||
# arpack
|
||||
laplacian = laplacian.tocsr()
|
||||
return laplacian
|
||||
|
||||
|
||||
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol=0.0,
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used to draw the ARPACK starting
        vector and the initial eigenvector approximation handed to lobpcg
        (for `eigen_solver` in `{'lobpcg', 'amg'}`). Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default=0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If `eigen_solver` is unknown, if it is ``'amg'`` while pyamg cannot
        be imported, or if the lobpcg/amg code path ends up with an
        embedding made of a single eigenvector.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the graph.
    A warning is issued when the graph is not fully connected.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`
    """
    adjacency = check_symmetric(adjacency)

    # pyamg is optional: a failed import only matters if 'amg' was requested.
    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError as e:
        if eigen_solver == "amg":
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"
    elif eigen_solver not in ("arpack", "lobpcg", "amg"):
        raise ValueError(
            "Unknown value for eigen_solver: '%s'. "
            "Should be 'amg', 'arpack', or 'lobpcg'" % eigen_solver
        )

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector: compute one extra component so
    # that n_components remain once the first one is discarded.
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    # arpack is used when requested, and also instead of 'amg' when the
    # laplacian is dense or the graph is small.
    if (
        eigen_solver == "arpack"
        or eigen_solver != "lobpcg"
        and (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=eigen_tol, v0=v0
            )
            # One eigenvector per row, taken in reverse order.
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == "amg":
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # The Laplacian matrix is always singular, having at least one zero
        # eigenvalue, corresponding to the trivial eigenvector, which is a
        # constant. Using a singular matrix for preconditioning may result in
        # random failures in LOBPCG and is not supported by the existing
        # theory:
        #     see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagonal is not all ones. The shift
        # does change the eigenpairs however, so we'll feed the shifted
        # matrix to the solver and afterward set it back to the original.
        diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
        laplacian += diag_shift
        ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
        laplacian -= diag_shift

        M = ml.aspreconditioner()
        # Create initial approximation X to eigenvectors
        X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
        X[:, 0] = dd.ravel()
        X = X.astype(laplacian.dtype)
        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.0e-5, largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # recover u = D^-1/2 x from the eigenvector output x
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError(
                "spectral_embedding with eigen_solver='amg' produced an "
                "embedding made of a single eigenvector, which is not supported."
            )

    # Not an elif: this branch also runs when arpack failed above and
    # switched eigen_solver to 'lobpcg'.
    if eigen_solver == "lobpcg":
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            _, diffusion_map = eigh(laplacian, check_finite=False)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension and create initial
            # approximation X to eigenvectors
            X = random_state.standard_normal(
                size=(laplacian.shape[0], n_components + 1)
            )
            X[:, 0] = dd.ravel()
            X = X.astype(laplacian.dtype)
            _, diffusion_map = lobpcg(
                laplacian, X, tol=1e-5, largest=False, maxiter=2000
            )
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError(
                    "spectral_embedding with eigen_solver='lobpcg' produced an "
                    "embedding made of a single eigenvector, which is not "
                    "supported."
                )

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
|
||||
|
||||
|
||||
class SpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    n_components : int, default=2
        The dimension of the projected subspace.

    affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
                'precomputed_nearest_neighbors'} or callable, \
                default='nearest_neighbors'
        How to construct the affinity matrix.
         - 'nearest_neighbors' : construct the affinity matrix by computing a
           graph of nearest neighbors. Sparse input is not supported by this
           option: a warning is issued and the rbf kernel is used instead.
         - 'rbf' : construct the affinity matrix by computing a radial basis
           function (RBF) kernel.
         - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
         - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
           of precomputed nearest neighbors, and constructs the affinity matrix
           by selecting the ``n_neighbors`` nearest neighbors.
         - callable : use passed in function as affinity
           the function takes in data matrix (n_samples, n_features)
           and return affinity matrix (n_samples, n_samples).

    gamma : float, default=None
        Kernel coefficient for rbf kernel. If None, gamma will be set to
        1/n_features.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator forwarded to
        :func:`spectral_embedding`, where it is used to initialize the
        eigen-solvers. Use an int to make the results deterministic across
        calls (See :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems.
        If None, then ``'arpack'`` is used.

    n_neighbors : int, default=None
        Number of nearest neighbors for nearest_neighbors graph building.
        If None, n_neighbors will be set to max(n_samples/10, 1).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Spectral embedding of the training matrix.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Affinity_matrix constructed from samples or precomputed.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_neighbors_ : int
        Number of nearest neighbors effectively used. Only set when the
        nearest neighbors graph is built from dense input.

    gamma_ : float
        Kernel coefficient effectively used. Only set when the rbf kernel
        is computed.

    See Also
    --------
    Isomap : Non-linear dimensionality reduction through Isometric Mapping.

    References
    ----------

    - :doi:`A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      <10.1007/s11222-007-9033-z>`

    - On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100

    - :doi:`Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      <10.1109/34.868688>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import SpectralEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = SpectralEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    def __init__(
        self,
        n_components=2,
        *,
        affinity="nearest_neighbors",
        gamma=None,
        random_state=None,
        eigen_solver=None,
        n_neighbors=None,
        n_jobs=None,
    ):
        self.n_components = n_components
        self.affinity = affinity
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _more_tags(self):
        # X is a pairwise (n_samples, n_samples) input for both precomputed
        # affinities.
        return {
            "pairwise": self.affinity
            in ["precomputed", "precomputed_nearest_neighbors"]
        }

    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data.

        The result is also stored in ``self.affinity_matrix_``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : array-like of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Y: Ignored

        Returns
        -------
        affinity_matrix of shape (n_samples, n_samples)
        """
        # Work on a local copy: the constructor parameter must not be
        # overwritten when sparse input forces the fallback to rbf,
        # otherwise a later fit on dense data would silently use rbf too.
        affinity = self.affinity

        if affinity == "precomputed":
            self.affinity_matrix_ = X
            return self.affinity_matrix_
        if affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            # symmetrize the connectivity graph
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
            return self.affinity_matrix_
        if affinity == "nearest_neighbors":
            if sparse.issparse(X):
                warnings.warn(
                    "Nearest neighbors affinity currently does "
                    "not support sparse input, falling back to "
                    "rbf affinity"
                )
                affinity = "rbf"
            else:
                self.n_neighbors_ = (
                    self.n_neighbors
                    if self.n_neighbors is not None
                    else max(int(X.shape[0] / 10), 1)
                )
                self.affinity_matrix_ = kneighbors_graph(
                    X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
                )
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (
                    self.affinity_matrix_ + self.affinity_matrix_.T
                )
                return self.affinity_matrix_
        if affinity == "rbf":
            self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_
        # Remaining case: a user supplied callable.
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_

    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix}, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `affinity` is neither a supported name nor a callable.
        """

        X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)

        random_state = check_random_state(self.random_state)
        if isinstance(self.affinity, str):
            if self.affinity not in {
                "nearest_neighbors",
                "rbf",
                "precomputed",
                "precomputed_nearest_neighbors",
            }:
                raise ValueError(
                    "%s is not a valid affinity. Expected "
                    "'precomputed', 'precomputed_nearest_neighbors', 'rbf', "
                    "'nearest_neighbors' or a callable."
                    % self.affinity
                )
        elif not callable(self.affinity):
            raise ValueError(
                "'affinity' is expected to be an affinity name or a callable. Got: %s"
                % self.affinity
            )

        affinity_matrix = self._get_affinity_matrix(X)
        self.embedding_ = spectral_embedding(
            affinity_matrix,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix} of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_components)
            Spectral embedding of the training matrix.
        """
        self.fit(X)
        return self.embedding_
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
|
||||
import numpy
|
||||
|
||||
|
||||
def configuration(parent_package="", top_path=None):
    """Return the numpy.distutils configuration of the ``manifold`` package.

    Two Cython extensions (``_utils`` and ``_barnes_hut_tsne``) and the
    ``tests`` subpackage are registered.
    """
    from numpy.distutils.misc_util import Configuration

    config = Configuration("manifold", parent_package, top_path)

    # "m" (presumably the C math library) is only linked on POSIX systems.
    libraries = ["m"] if os.name == "posix" else []

    # Both extensions are built with the same options from "<name>.pyx".
    for extension_name in ("_utils", "_barnes_hut_tsne"):
        config.add_extension(
            extension_name,
            sources=[extension_name + ".pyx"],
            include_dirs=[numpy.get_include()],
            libraries=libraries,
            extra_compile_args=["-O3"],
        )

    config.add_subpackage("tests")
    return config


if __name__ == "__main__":
    from numpy.distutils.core import setup

    setup(**configuration().todict())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,293 @@
|
||||
from itertools import product
|
||||
import numpy as np
|
||||
import math
|
||||
from numpy.testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn import manifold
|
||||
from sklearn import neighbors
|
||||
from sklearn import pipeline
|
||||
from sklearn import preprocessing
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.utils._testing import assert_allclose, assert_allclose_dense_sparse
|
||||
|
||||
from scipy.sparse import rand as sparse_rand
|
||||
|
||||
# Values of the Isomap `eigen_solver` and `path_method` parameters that the
# tests below iterate over.
eigen_solvers = ["auto", "dense", "arpack"]
path_methods = ["auto", "FW", "D"]
|
||||
|
||||
|
||||
def create_sample_data(n_pts=25, add_noise=False):
    """Return a square 2D grid of equidistant points, optionally with noise.

    Parameters
    ----------
    n_pts : int, default=25
        Requested number of points. The grid has ``int(sqrt(n_pts))`` points
        per side, so fewer points are returned when `n_pts` is not a
        perfect square.

    add_noise : bool, default=False
        If True, a third column of seeded Gaussian noise (scaled by 0.1) is
        appended.

    Returns
    -------
    X : ndarray of shape (n_per_side ** 2, 2) or (n_per_side ** 2, 3)
        The grid coordinates, plus the noise column when requested.
    """
    # grid of equidistant points in 2D, n_components = n_dim
    n_per_side = int(math.sqrt(n_pts))
    X = np.array(list(product(range(n_per_side), repeat=2)))
    if add_noise:
        # add noise in a third dimension; size it from the grid actually
        # built so that a non-square n_pts does not break the concatenation
        rng = np.random.RandomState(0)
        noise = 0.1 * rng.randn(X.shape[0], 1)
        X = np.concatenate((X, noise), 1)
    return X
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
def test_isomap_simple_grid(n_neighbors, radius):
    """Isomap should preserve distances when all neighbors are used."""
    X = create_sample_data(n_pts=25, add_noise=False)

    def distance_graph(points):
        # distances from each point to all others
        if n_neighbors is None:
            return neighbors.radius_neighbors_graph(points, radius, mode="distance")
        return neighbors.kneighbors_graph(points, n_neighbors, mode="distance")

    G = distance_graph(X)

    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        clf = manifold.Isomap(
            n_neighbors=n_neighbors,
            radius=radius,
            n_components=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        clf.fit(X)

        G_iso = distance_graph(clf.embedding_)
        assert_allclose_dense_sparse(G, G_iso)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
def test_isomap_reconstruction_error(n_neighbors, radius):
    """Check `reconstruction_error` against a manual kernel comparison.

    Same setup as in test_isomap_simple_grid, with an added noise dimension.
    """
    n_pts = 25
    X = create_sample_data(n_pts=n_pts, add_noise=True)

    def dense_distances(points):
        if n_neighbors is None:
            graph = neighbors.radius_neighbors_graph(points, radius, mode="distance")
        else:
            graph = neighbors.kneighbors_graph(points, n_neighbors, mode="distance")
        return graph.toarray()

    # compute input kernel
    G = dense_distances(X)
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G**2)

    for eigen_solver, path_method in product(eigen_solvers, path_methods):
        clf = manifold.Isomap(
            n_neighbors=n_neighbors,
            radius=radius,
            n_components=2,
            eigen_solver=eigen_solver,
            path_method=path_method,
        )
        clf.fit(X)

        # compute output kernel
        G_iso = dense_distances(clf.embedding_)
        K_iso = centerer.fit_transform(-0.5 * G_iso**2)

        # make sure error agrees
        expected_error = np.linalg.norm(K - K_iso) / n_pts
        assert_almost_equal(expected_error, clf.reconstruction_error())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(n_neighbors, radius):
    """Re-embedding slightly perturbed training points stays close."""
    noise_scale = 0.01

    # S-curve dataset with 200 samples
    X, _ = datasets.make_s_curve(200, random_state=0)

    # Isomap embedding of the clean data in 10 dimensions
    iso = manifold.Isomap(n_components=10, n_neighbors=n_neighbors, radius=radius)
    X_iso = iso.fit_transform(X)

    # Embed a noisy version of the same points with the fitted model
    perturbation = noise_scale * np.random.RandomState(0).randn(*X.shape)
    X_iso2 = iso.transform(X + perturbation)

    # The rms error on re-embedding must be comparable to noise_scale
    rms_error = np.sqrt(np.mean((X_iso - X_iso2) ** 2))
    assert rms_error < 2 * noise_scale
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius):
    """Isomap works as a transformer in front of a classifier in a Pipeline.

    The pipeline must fit without error and score above 0.9 on its own
    training data.
    """
    X, y = datasets.make_blobs(random_state=0)
    steps = [
        ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
        ("clf", neighbors.KNeighborsClassifier()),
    ]
    clf = pipeline.Pipeline(steps)
    clf.fit(X, y)
    assert clf.score(X, y) > 0.9
|
||||
|
||||
|
||||
def test_pipeline_with_nearest_neighbors_transformer():
    """Chaining KNeighborsTransformer with Isomap(metric='precomputed')
    must give the same embedding as Isomap computing the neighbors itself.
    """
    algorithm = "auto"
    n_neighbors = 10

    X_fit, _ = datasets.make_blobs(random_state=0)
    X_new, _ = datasets.make_blobs(random_state=1)

    # chained version: neighbors graph first, then Isomap on that graph
    knn_transformer = neighbors.KNeighborsTransformer(
        n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
    )
    isomap_on_graph = manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed")
    est_chain = pipeline.make_pipeline(knn_transformer, isomap_on_graph)

    # compact version: Isomap alone
    est_compact = manifold.Isomap(
        n_neighbors=n_neighbors, neighbors_algorithm=algorithm
    )

    # embeddings of the training data agree
    Xt_chain = est_chain.fit_transform(X_fit)
    Xt_compact = est_compact.fit_transform(X_fit)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    # embeddings of unseen data agree
    Xt_chain = est_chain.transform(X_new)
    Xt_compact = est_compact.transform(X_new)
    assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_different_metric():
    """The metric parameters are honoured and default to euclidean."""

    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1**2 + x2**2))

    # (metric, p, whether the embedding must match the default one)
    cases = [
        ("euclidean", 2, True),
        ("manhattan", 1, False),
        ("minkowski", 1, False),
        ("minkowski", 2, True),
        (custom_metric, 2, False),
    ]

    X, _ = datasets.make_blobs(random_state=0)
    reference = manifold.Isomap().fit_transform(X)

    for metric, p, is_euclidean in cases:
        embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)

        if not is_euclidean:
            # a non-euclidean metric must produce a different embedding
            with pytest.raises(AssertionError, match="not almost equal"):
                assert_array_almost_equal(embedding, reference)
            continue

        assert_array_almost_equal(embedding, reference)
|
||||
|
||||
|
||||
def test_isomap_clone_bug():
    """Regression test for the bug reported in #6062.

    After `set_params(n_neighbors=...)` and a refit, the fitted `nbrs_`
    estimator must use the new number of neighbors.
    """
    # Seeded generator instead of the global numpy RNG, so that the test
    # data is identical on every run.
    rng = np.random.RandomState(0)
    model = manifold.Isomap()
    for n_neighbors in [10, 15, 20]:
        model.set_params(n_neighbors=n_neighbors)
        model.fit(rng.rand(50, 2))
        assert model.nbrs_.n_neighbors == n_neighbors
|
||||
|
||||
|
||||
def test_sparse_input():
    """Isomap must fit sparse CSR input for every solver / path method."""
    # Seeded so that the random sparse matrix is the same on every run.
    X = sparse_rand(100, 3, density=0.1, format="csr", random_state=0)

    # Should not error
    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(
                n_components=2,
                eigen_solver=eigen_solver,
                path_method=path_method,
                n_neighbors=8,
            )
            clf.fit(X)
|
||||
|
||||
|
||||
def test_isomap_fit_precomputed_radius_graph():
    """Fitting on a precomputed radius graph matches fitting on raw data."""
    X, _ = datasets.make_s_curve(200, random_state=0)
    radius = 10

    # embedding obtained from the precomputed distance graph
    graph = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
    isomap_precomputed = manifold.Isomap(
        n_neighbors=None, radius=radius, metric="precomputed"
    )
    isomap_precomputed.fit(graph)
    precomputed_result = isomap_precomputed.embedding_

    # embedding obtained directly from the samples
    isomap_minkowski = manifold.Isomap(
        n_neighbors=None, radius=radius, metric="minkowski"
    )
    result = isomap_minkowski.fit_transform(X)

    assert_allclose(precomputed_result, result)
|
||||
|
||||
|
||||
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
    """`fit_transform` raises a ValueError when both `radius` and
    `n_neighbors` are provided.
    """
    X, _ = datasets.load_digits(return_X_y=True)
    isomap = manifold.Isomap(n_neighbors=3, radius=5.5)

    with pytest.raises(ValueError, match="Both n_neighbors and radius are provided"):
        isomap.fit_transform(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components():
    """A warning is raised when the graph has multiple components."""
    # two well separated groups of points on a line
    X = np.array([0, 1, 2, 5, 6, 7]).reshape(-1, 1)
    isomap = manifold.Isomap(n_neighbors=2)
    with pytest.warns(UserWarning, match="number of connected components"):
        isomap.fit(X)
|
||||
|
||||
|
||||
def test_multiple_connected_components_metric_precomputed():
    """Multiple components with precomputed input: a dense distance matrix
    only warns, while a sparse neighbors graph raises an error.
    """
    X = np.array([0, 1, 2, 5, 6, 7]).reshape(-1, 1)
    expected_message = "number of connected components"

    # works with a precomputed distance matrix (dense)
    X_distances = pairwise_distances(X)
    with pytest.warns(UserWarning, match=expected_message):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)

    # does not work with a precomputed neighbors graph (sparse)
    X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
    with pytest.raises(RuntimeError, match=expected_message):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
|
||||
|
||||
|
||||
def test_get_feature_names_out():
    """Check get_feature_names_out for Isomap."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)

    embedder = manifold.Isomap(n_components=n_components)
    embedder.fit_transform(X)

    expected_names = [f"isomap{idx}" for idx in range(n_components)]
    assert_array_equal(expected_names, embedder.get_feature_names_out())
|
||||
@@ -0,0 +1,186 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
)
|
||||
from scipy import linalg
|
||||
import pytest
|
||||
|
||||
from sklearn import neighbors, manifold
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
eigen_solvers = ["dense", "arpack"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test utility routines
|
||||
def test_barycenter_kneighbors_graph(global_dtype):
    """Check the values, dtype and row sums of the barycenter neighbors graph."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)

    # With a single neighbor the expected graph holds one unit weight per row.
    one_nn_graph = barycenter_kneighbors_graph(X, 1)
    expected_weights = np.array(
        [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
    )
    assert one_nn_graph.dtype == global_dtype
    assert_allclose(one_nn_graph.toarray(), expected_weights)

    two_nn_weights = barycenter_kneighbors_graph(X, 2).toarray()
    # check that rows sum to one (the sum runs along axis=1)
    assert_allclose(two_nn_weights.sum(axis=1), np.ones(3))
    reconstructed = np.dot(two_nn_weights, X)
    assert linalg.norm(reconstructed - X) / X.shape[0] < 1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test LLE by computing the reconstruction error on some manifolds.
|
||||
|
||||
|
||||
def test_lle_simple_grid(global_dtype):
    """Embed a slightly jittered 5x5 grid and check LLE reconstruction errors."""
    # note: ARPACK is numerically unstable, so this test will fail for
    # some random seeds. We choose 42 because the tests pass.
    # for arm64 platforms 2 makes the test fail.
    # TODO: rewrite this test to make less sensitive to the random seed,
    # irrespective of the platform.
    rng = np.random.RandomState(42)
    tol = 0.1
    n_components = 2

    # grid of equidistant points in 2D, n_components = n_dim
    grid = np.array(list(product(range(5), repeat=2)))
    jitter = 1e-10 * rng.uniform(size=grid.shape)
    X = (grid + jitter).astype(global_dtype, copy=False)

    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=5, n_components=n_components, random_state=rng
    )

    weights = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    assert linalg.norm(np.dot(weights, X) - X, "fro") < tol

    for solver in eigen_solvers:
        clf.set_params(eigen_solver=solver)
        clf.fit(X)
        embedding = clf.embedding_
        assert embedding.shape[1] == n_components

        residual = np.dot(weights, embedding) - embedding
        reconstruction_error = linalg.norm(residual, "fro") ** 2
        assert reconstruction_error < tol
        assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)

    # re-embed a noisy version of X using the transform method
    noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
    X_reembedded = clf.transform(X + noise)
    assert linalg.norm(X_reembedded - clf.embedding_) < tol
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
    """Check LLE reconstruction errors on an 18x18 grid lifted onto a parabola."""
    rng = np.random.RandomState(0)
    n_components = 2
    tol = 1.5 if method == "standard" else 3

    # similar test on a slightly more complex manifold
    grid = np.array(list(product(np.arange(18), repeat=2)))
    lifted = np.c_[grid, grid[:, 0] ** 2 / 18]
    jitter = 1e-10 * rng.uniform(size=lifted.shape)
    X = (lifted + jitter).astype(global_dtype, copy=False)

    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=6, n_components=n_components, method=method, random_state=0
    )

    weights = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    assert linalg.norm(np.dot(weights, X) - X) < tol

    clf.set_params(eigen_solver=solver)
    clf.fit(X)
    embedding = clf.embedding_
    assert embedding.shape[1] == n_components

    residual = np.dot(weights, embedding) - embedding
    reconstruction_error = linalg.norm(residual, "fro") ** 2
    details = "solver: %s, method: %s" % (solver, method)
    assert reconstruction_error < tol, details

    # The error reported by the estimator must agree with the one recomputed
    # here, up to a relative factor of `tol`.
    gap = np.abs(clf.reconstruction_error_ - reconstruction_error)
    assert gap < tol * reconstruction_error, details
|
||||
|
||||
|
||||
# Test the error raised when parameter passed to lle is invalid
|
||||
def test_lle_init_parameters():
    """Fitting with an invalid `eigen_solver` or `method` raises ValueError."""
    X = np.random.rand(5, 3)

    invalid_cases = (
        ({"eigen_solver": "error"}, "unrecognized eigen_solver 'error'"),
        ({"method": "error"}, "unrecognized method 'error'"),
    )
    for params, msg in invalid_cases:
        clf = manifold.LocallyLinearEmbedding(**params)
        with pytest.raises(ValueError, match=msg):
            clf.fit(X)
|
||||
|
||||
|
||||
def test_pipeline():
    """LocallyLinearEmbedding works as a transformer step inside a Pipeline."""
    # TODO check that it actually does something useful
    from sklearn import pipeline, datasets

    X, y = datasets.make_blobs(random_state=0)
    steps = [
        ("filter", manifold.LocallyLinearEmbedding(random_state=0)),
        ("clf", neighbors.KNeighborsClassifier()),
    ]
    model = pipeline.Pipeline(steps)
    model.fit(X, y)

    # Besides running without error, the pipeline must score above 0.9 on
    # the data it was fitted on.
    assert model.score(X, y) > 0.9
|
||||
|
||||
|
||||
# Test the error raised when the weight matrix is singular
|
||||
def test_singular_matrix():
    """locally_linear_embedding raises ValueError when the weight matrix is
    singular (here: ten identical samples)."""
    M = np.ones((10, 3))

    # `ignore_warnings` has to be active *while* the function runs. The former
    # spelling, `ignore_warnings(manifold.locally_linear_embedding(...))`,
    # evaluated the call first and only then handed its result to
    # `ignore_warnings`, so no warning was ever filtered. Used as a context
    # manager it wraps the call itself.
    with ignore_warnings(), pytest.raises(ValueError):
        manifold.locally_linear_embedding(
            M,
            n_neighbors=2,
            n_components=1,
            method="standard",
            eigen_solver="arpack",
        )
|
||||
|
||||
|
||||
# regression test for #6033
|
||||
def test_integer_input():
    """Every LLE method accepts integer-valued input (see #6033)."""
    rand = np.random.RandomState(0)
    X = rand.randint(0, 100, size=(20, 3))

    lle_methods = ("standard", "hessian", "modified", "ltsa")
    for lle_method in lle_methods:
        estimator = manifold.LocallyLinearEmbedding(
            method=lle_method, n_neighbors=10
        )
        # this previously raised a TypeError
        estimator.fit(X)
|
||||
|
||||
|
||||
def test_get_feature_names_out():
    """Check get_feature_names_out for LocallyLinearEmbedding."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)

    lle = manifold.LocallyLinearEmbedding(n_components=n_components)
    lle.fit(X)

    expected_names = [
        f"locallylinearembedding{idx}" for idx in range(n_components)
    ]
    assert_array_equal(expected_names, lle.get_feature_names_out())
|
||||
@@ -0,0 +1,44 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
import pytest
|
||||
|
||||
from sklearn.manifold import _mds as mds
|
||||
|
||||
|
||||
def test_smacof():
    """Run one metric SMACOF iteration and compare with reference values."""
    # test metric smacof using the data of "Modern Multidimensional Scaling",
    # Borg & Groenen, p 154
    dissimilarities = np.array(
        [[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]
    )
    initial_config = np.array(
        [[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]
    )
    expected_config = np.array(
        [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
    )

    embedding, _ = mds.smacof(
        dissimilarities, init=initial_config, n_components=2, max_iter=1, n_init=1
    )
    assert_array_almost_equal(embedding, expected_config, decimal=3)
|
||||
|
||||
|
||||
def test_smacof_error():
    """smacof raises ValueError on malformed similarity or init arrays."""
    # Similarity matrix that is not symmetric:
    asymmetric = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(asymmetric)

    # Similarity matrix that is not square (3 rows, 4 columns):
    non_square = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(non_square)

    # init is given, but its number of rows (3) does not match the 4 samples:
    valid_sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    bad_init = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
    with pytest.raises(ValueError):
        mds.smacof(valid_sim, init=bad_init, n_init=1)
|
||||
|
||||
|
||||
def test_MDS():
    """Smoke test: non-metric MDS fits a precomputed dissimilarity matrix."""
    dissimilarities = np.array(
        [[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]
    )
    estimator = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
    estimator.fit(dissimilarities)
|
||||
@@ -0,0 +1,482 @@
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
from scipy.sparse import csgraph
|
||||
from scipy.linalg import eigh
|
||||
|
||||
from sklearn.manifold import SpectralEmbedding
|
||||
from sklearn.manifold._spectral_embedding import _graph_is_connected
|
||||
from sklearn.manifold._spectral_embedding import _graph_connected_component
|
||||
from sklearn.manifold import spectral_embedding
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
from sklearn.metrics import normalized_mutual_info_score
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.utils.extmath import _deterministic_vector_sign_flip
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
# Probe for the optional PyAMG dependency once, at import time.
try:
    from pyamg import smoothed_aggregation_solver  # noqa
except ImportError:
    pyamg_available = False
else:
    pyamg_available = True

# Marker used to skip the "amg" solver cases when PyAMG is not installed.
skip_if_no_pyamg = pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)

# Non-centered, sparse cluster centers used to generate the blob dataset `S`
# shared by the tests in this module.
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
|
||||
|
||||
|
||||
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
    """Assert that A and B match column by column, up to a sign flip.

    Each column of A must be equal to the matching column of B, or to its
    negation, with every entry differing by at most `tol` in absolute value.
    """
    threshold = tol**2
    for col_a, col_b in zip(A.T, B.T):
        same_sign = np.max((col_a - col_b) ** 2) <= threshold
        # The flipped comparison is only evaluated when the direct one fails.
        matches = same_sign or np.max((col_a + col_b) ** 2) <= threshold
        assert matches
|
||||
|
||||
|
||||
def test_sparse_graph_connected_component():
    """_graph_connected_component recovers each group of a sparse affinity
    matrix built from disjoint groups of samples."""
    rng = np.random.RandomState(42)
    n_samples = 300
    boundaries = [0, 42, 121, 200, n_samples]
    spans = list(zip(boundaries[:-1], boundaries[1:]))
    p = rng.permutation(n_samples)
    n_random_connections = 1000
    connections = []

    for start, stop in spans:
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        connections.extend(zip(group[:-1], group[1:]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in spans:
        expected_size = stop - start
        from_first = _graph_connected_component(affinity, p[start])
        assert from_first.sum() == expected_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        from_last = _graph_connected_component(affinity, p[stop - 1])
        assert from_last.sum() == expected_size
        assert_array_equal(from_first, from_last)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=36):
    """Test spectral embedding with two components.

    Two dense blocks are joined by a single edge; the sign of the
    one-dimensional embedding must separate the two blocks.
    """
    random_state = np.random.RandomState(seed)
    n_sample = 100
    n_total = 2 * n_sample
    affinity = np.zeros(shape=[n_total, n_total])

    # first component
    first_block = np.abs(random_state.randn(n_sample, n_sample)) + 2
    affinity[:n_sample, :n_sample] = first_block
    # second component
    second_block = np.abs(random_state.randn(n_sample, n_sample)) + 2
    affinity[n_sample:, n_sample:] = second_block

    # Test of internal _graph_connected_component before connection
    from_first = _graph_connected_component(affinity, 0)
    assert from_first[:n_sample].all()
    assert not from_first[n_sample:].any()
    from_last = _graph_connected_component(affinity, -1)
    assert not from_last[:n_sample].any()
    assert from_last[n_sample:].all()

    # connection
    affinity[0, n_sample + 1] = affinity[n_sample + 1, 0] = 1
    np.fill_diagonal(affinity, 0)
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=n_total)
    true_label[:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))

    # thresholding on the first components using 0.
    label_ = (embedded_coordinate.ravel() < 0).astype(np.int64)
    assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(X, eigen_solver, dtype, seed=36):
    """Test spectral embedding with precomputed kernel.

    Feeding a precomputed RBF kernel must give the same affinity matrix and
    embedding (up to sign) as letting the estimator compute the RBF kernel.
    """
    gamma = 1.0
    shared_params = {"n_components": 2, "eigen_solver": eigen_solver}
    se_precomp = SpectralEmbedding(
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        **shared_params,
    )
    se_rbf = SpectralEmbedding(
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        **shared_params,
    )

    X_typed = X.astype(dtype)
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X_typed, gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X_typed)

    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
|
||||
|
||||
|
||||
def test_precomputed_nearest_neighbors_filtering():
    """Test precomputed graph filtering when containing too many neighbors.

    A graph built with 10 extra neighbors must give the same embedding as
    the graph built with exactly `n_neighbors` neighbors.
    """
    n_neighbors = 2

    def embed_with(additional_neighbors):
        # Fit on a connectivity graph holding `n_neighbors` plus extras.
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors)
        graph = nn.fit(S).kneighbors_graph(S, mode="connectivity")
        estimator = SpectralEmbedding(
            random_state=0,
            n_components=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        return estimator.fit(graph).embedding_

    exact, oversized = [embed_with(extra) for extra in (0, 10)]
    assert_array_equal(exact, oversized)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
def test_spectral_embedding_callable_affinity(X, seed=36):
    """Test spectral embedding with callable affinity.

    A callable computing the RBF kernel must behave like `affinity="rbf"`.
    """
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)

    def rbf_affinity(x):
        return rbf_kernel(x, gamma=gamma)

    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=rbf_affinity,
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )

    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)

    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
|
||||
|
||||
|
||||
# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@skip_if_no_pyamg
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver(dtype, seed=36):
    """The "amg" and "arpack" solvers must give the same embedding, up to sign."""

    def make_embedder(solver):
        # Both estimators share every setting except the eigen solver.
        return SpectralEmbedding(
            n_components=2,
            affinity="nearest_neighbors",
            eigen_solver=solver,
            n_neighbors=5,
            random_state=np.random.RandomState(seed),
        )

    se_amg = make_embedder("amg")
    se_arpack = make_embedder("arpack")

    X = S.astype(dtype)
    embed_amg = se_amg.fit_transform(X)
    embed_arpack = se_arpack.fit_transform(X)
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes, given as (row, col, value) edges
    edges = [
        (0, 1, 100),
        (0, 2, 100),
        (1, 2, 100),
        (2, 3, 1),
        (3, 4, 100),
        (3, 5, 100),
        (4, 5, 100),
    ]
    row, col, val = (list(values) for values in zip(*edges))

    # Every edge is listed in both directions so the matrix is symmetric.
    affinity = sparse.coo_matrix(
        (val + val, (row + col, col + row)), shape=(6, 6)
    ).toarray()
    affinity = affinity.astype(dtype)

    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity)
    embed_arpack = se_arpack.fit_transform(affinity)
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
|
||||
|
||||
|
||||
# TODO: Remove filterwarnings when pyamg replaces the sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@skip_if_no_pyamg
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    """Non-regression test for amg solver failure (issue #13393 on github)."""
    num_nodes = 100
    random_graph = sparse.rand(
        num_nodes, num_nodes, density=0.1, random_state=seed
    ).astype(dtype)

    # Symmetrize from the strict upper triangle (diagonal removed).
    upper = sparse.triu(random_graph) - sparse.diags(random_graph.diagonal())
    sym_matrix = upper + upper.T

    def embed(random_state):
        return spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=random_state
        )

    reference = embed(0)

    # Check that the learned embedding is stable w.r.t. random solver init:
    for other_state in (1, 2, 3):
        _assert_equal_with_sign_flipping(reference, embed(other_state), tol=0.05)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    """Test using pipeline to do spectral clustering.

    KMeans run on the spectral embedding must recover the true blob labels.
    """
    # One RandomState instance is deliberately shared by every estimator.
    random_state = np.random.RandomState(seed)
    embedders = [
        SpectralEmbedding(
            n_components=n_clusters, affinity="rbf", random_state=random_state
        ),
        SpectralEmbedding(
            n_components=n_clusters,
            affinity="nearest_neighbors",
            n_neighbors=5,
            random_state=random_state,
        ),
    ]

    for se in embedders:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        score = normalized_mutual_info_score(km.labels_, true_labels)
        assert_array_almost_equal(score, 1.0, 2)
|
||||
|
||||
|
||||
def test_spectral_embedding_unknown_eigensolver(seed=36):
    """SpectralEmbedding.fit raises ValueError for an unknown eigen solver."""
    params = {
        "n_components": 1,
        "affinity": "precomputed",
        "random_state": np.random.RandomState(seed),
        "eigen_solver": "<unknown>",
    }
    estimator = SpectralEmbedding(**params)

    with pytest.raises(ValueError):
        estimator.fit(S)
|
||||
|
||||
|
||||
def test_spectral_embedding_unknown_affinity(seed=36):
    """SpectralEmbedding.fit raises ValueError for an unknown affinity type."""
    params = {
        "n_components": 1,
        "affinity": "<unknown>",
        "random_state": np.random.RandomState(seed),
    }
    estimator = SpectralEmbedding(**params)

    with pytest.raises(ValueError):
        estimator.fit(S)
|
||||
|
||||
|
||||
def test_connectivity(seed=36):
    """Test that graph connectivity test works as expected.

    Each graph is checked as a dense array, a CSR matrix and a CSC matrix.
    """
    # Node 0 only links to itself, so this graph is not connected.
    disconnected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    # Same graph with an extra 0 <-> 1 edge: a single chain over all nodes.
    connected = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    containers = (lambda graph: graph, sparse.csr_matrix, sparse.csc_matrix)

    for convert in containers:
        assert not _graph_is_connected(convert(disconnected))
    for convert in containers:
        assert _graph_is_connected(convert(connected))
|
||||
|
||||
|
||||
def test_spectral_embedding_deterministic():
    """Two spectral_embedding calls on the same affinity give the same result."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))

    first_run = spectral_embedding(sims)
    second_run = spectral_embedding(sims)
    assert_array_almost_equal(first_run, second_run)
|
||||
|
||||
|
||||
def test_spectral_embedding_unnormalized():
    """spectral_embedding with the unnormalized laplacian matches a manual
    dense eigendecomposition."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))
    n_components = 8

    computed = spectral_embedding(
        sims, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Verify using manual computation with dense eigh
    laplacian, _ = csgraph.laplacian(sims, normed=False, return_diag=True)
    eigenvectors = eigh(laplacian)[1]
    leading = eigenvectors.T[:n_components]
    expected = _deterministic_vector_sign_flip(leading).T

    assert_array_almost_equal(computed, expected)
|
||||
|
||||
|
||||
def test_spectral_embedding_first_eigen_vector():
    """The first eigenvector of spectral_embedding is constant and the second
    is not (for a connected graph)."""
    rng = np.random.RandomState(36)
    sims = rbf_kernel(rng.randn(10, 30))
    n_components = 2

    for seed in range(10):
        embedding = spectral_embedding(
            sims,
            norm_laplacian=False,
            n_components=n_components,
            drop_first=False,
            random_state=seed,
        )
        first_spread, second_spread = np.std(embedding[:, 0]), np.std(embedding[:, 1])

        assert first_spread == pytest.approx(0)
        assert second_spread > 1e-3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the dtype of the fitted
    attributes and of the transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, that test only runs
    with transformers implementing `transform`, while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    estimator = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    X_trans = estimator.fit_transform(S.astype(dtype))

    assert X_trans.dtype == dtype
    assert estimator.embedding_.dtype == dtype
    assert estimator.affinity_matrix_.dtype == dtype
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    """Requesting the "amg" solver without PyAMG installed raises ValueError."""
    estimator = SpectralEmbedding(n_components=2, affinity="rbf", eigen_solver="amg")
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."

    with pytest.raises(ValueError, match=err_msg):
        estimator.fit_transform(S)
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user