first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
"""
The :mod:`sklearn.neighbors` module implements the k-nearest neighbors
algorithm.
"""
from ._ball_tree import BallTree
from ._kd_tree import KDTree
from ._distance_metric import DistanceMetric
from ._graph import kneighbors_graph, radius_neighbors_graph
from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer
from ._unsupervised import NearestNeighbors
from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier
from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor
from ._nearest_centroid import NearestCentroid
from ._kde import KernelDensity
from ._lof import LocalOutlierFactor
from ._nca import NeighborhoodComponentsAnalysis
from ._base import VALID_METRICS, VALID_METRICS_SPARSE
__all__ = [
"BallTree",
"DistanceMetric",
"KDTree",
"KNeighborsClassifier",
"KNeighborsRegressor",
"KNeighborsTransformer",
"NearestCentroid",
"NearestNeighbors",
"RadiusNeighborsClassifier",
"RadiusNeighborsRegressor",
"RadiusNeighborsTransformer",
"kneighbors_graph",
"radius_neighbors_graph",
"KernelDensity",
"LocalOutlierFactor",
"NeighborhoodComponentsAnalysis",
"VALID_METRICS",
"VALID_METRICS_SPARSE",
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,700 @@
"""Nearest Neighbor Classification"""
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Sparseness support by Lars Buitinck
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam
import numpy as np
from scipy import stats
from ..utils.extmath import weighted_mode
from ..utils.validation import _is_arraylike, _num_samples
import warnings
from ._base import _check_weights, _get_weights
from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin
from ..base import ClassifierMixin
class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):
"""Classifier implementing the k-nearest neighbors vote.
Read more in the :ref:`User Guide <classification>`.
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to use by default for :meth:`kneighbors` queries.
weights : {'uniform', 'distance'} or callable, default='uniform'
Weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood
are weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric` and the metrics listed in
`sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the
"cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Doesn't affect :meth:`fit` method.
Attributes
----------
classes_ : array of shape (n_classes,)
Class labels known to the classifier
effective_metric_ : str or callble
The distance metric used. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
outputs_2d_ : bool
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
otherwise True.
See Also
--------
RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.
KNeighborsRegressor: Regression based on k-nearest neighbors.
RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.
NearestNeighbors: Unsupervised learner for implementing neighbor searches.
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
.. warning::
Regarding the Nearest Neighbors algorithms, if it is found that two
neighbors, neighbor `k+1` and `k`, have identical distances
but different labels, the results will depend on the ordering of the
training data.
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
Examples
--------
>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import KNeighborsClassifier
>>> neigh = KNeighborsClassifier(n_neighbors=3)
>>> neigh.fit(X, y)
KNeighborsClassifier(...)
>>> print(neigh.predict([[1.1]]))
[0]
>>> print(neigh.predict_proba([[0.9]]))
[[0.666... 0.333...]]
"""
def __init__(
self,
n_neighbors=5,
*,
weights="uniform",
algorithm="auto",
leaf_size=30,
p=2,
metric="minkowski",
metric_params=None,
n_jobs=None,
):
super().__init__(
n_neighbors=n_neighbors,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.weights = weights
def fit(self, X, y):
"""Fit the k-nearest neighbors classifier from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : {array-like, sparse matrix} of shape (n_samples,) or \
(n_samples, n_outputs)
Target values.
Returns
-------
self : KNeighborsClassifier
The fitted k-nearest neighbors classifier.
"""
self.weights = _check_weights(self.weights)
return self._fit(X, y)
def predict(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
Class labels for each data sample.
"""
if self.weights == "uniform":
# In that case, we do not need the distances to perform
# the weighting so we do not compute them.
neigh_ind = self.kneighbors(X, return_distance=False)
neigh_dist = None
else:
neigh_dist, neigh_ind = self.kneighbors(X)
classes_ = self.classes_
_y = self._y
if not self.outputs_2d_:
_y = self._y.reshape((-1, 1))
classes_ = [self.classes_]
n_outputs = len(classes_)
n_queries = _num_samples(X)
weights = _get_weights(neigh_dist, self.weights)
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
for k, classes_k in enumerate(classes_):
if weights is None:
mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
else:
mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
mode = np.asarray(mode.ravel(), dtype=np.intp)
y_pred[:, k] = classes_k.take(mode)
if not self.outputs_2d_:
y_pred = y_pred.ravel()
return y_pred
def predict_proba(self, X):
"""Return probability estimates for the test data X.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \
of such arrays if n_outputs > 1.
The class probabilities of the input samples. Classes are ordered
by lexicographic order.
"""
if self.weights == "uniform":
# In that case, we do not need the distances to perform
# the weighting so we do not compute them.
neigh_ind = self.kneighbors(X, return_distance=False)
neigh_dist = None
else:
neigh_dist, neigh_ind = self.kneighbors(X)
classes_ = self.classes_
_y = self._y
if not self.outputs_2d_:
_y = self._y.reshape((-1, 1))
classes_ = [self.classes_]
n_queries = _num_samples(X)
weights = _get_weights(neigh_dist, self.weights)
if weights is None:
weights = np.ones_like(neigh_ind)
all_rows = np.arange(n_queries)
probabilities = []
for k, classes_k in enumerate(classes_):
pred_labels = _y[:, k][neigh_ind]
proba_k = np.zeros((n_queries, classes_k.size))
# a simple ':' index doesn't work right
for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors)
proba_k[all_rows, idx] += weights[:, i]
# normalize 'votes' into real [0,1] probabilities
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
probabilities.append(proba_k)
if not self.outputs_2d_:
probabilities = probabilities[0]
return probabilities
def _more_tags(self):
return {"multilabel": True}
class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
"""Classifier implementing a vote among neighbors within a given radius.
Read more in the :ref:`User Guide <classification>`.
Parameters
----------
radius : float, default=1.0
Range of parameter space to use by default for :meth:`radius_neighbors`
queries.
weights : {'uniform', 'distance'} or callable, default='uniform'
Weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood
are weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
Uniform weights are used by default.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
Distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
outlier_label : {manual label, 'most_frequent'}, default=None
Label for outlier samples (samples with no neighbors in given radius).
- manual label: str or int label (should be the same type as y)
or list of manual labels if multi-output is used.
- 'most_frequent' : assign the most frequent label of y to outliers.
- None : when any outlier is detected, ValueError will be raised.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
**kwargs : dict
Additional keyword arguments passed to the constructor.
.. deprecated:: 1.0
The RadiusNeighborsClassifier class will not longer accept extra
keyword parameters in 1.2 since they are unused.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Class labels known to the classifier.
effective_metric_ : str or callable
The distance metric used. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
outlier_label_ : int or array-like of shape (n_class,)
Label which is given for outlier samples (samples with no neighbors
on given radius).
outputs_2d_ : bool
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
otherwise True.
See Also
--------
KNeighborsClassifier : Classifier implementing the k-nearest neighbors
vote.
RadiusNeighborsRegressor : Regression based on neighbors within a
fixed radius.
KNeighborsRegressor : Regression based on k-nearest neighbors.
NearestNeighbors : Unsupervised learner for implementing neighbor
searches.
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
Examples
--------
>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import RadiusNeighborsClassifier
>>> neigh = RadiusNeighborsClassifier(radius=1.0)
>>> neigh.fit(X, y)
RadiusNeighborsClassifier(...)
>>> print(neigh.predict([[1.5]]))
[0]
>>> print(neigh.predict_proba([[1.0]]))
[[0.66666667 0.33333333]]
"""
def __init__(
self,
radius=1.0,
*,
weights="uniform",
algorithm="auto",
leaf_size=30,
p=2,
metric="minkowski",
outlier_label=None,
metric_params=None,
n_jobs=None,
**kwargs,
):
# TODO: Remove in v1.2
if len(kwargs) > 0:
warnings.warn(
"Passing additional keyword parameters has no effect and is "
"deprecated in 1.0. An error will be raised from 1.2 and "
"beyond. The ignored keyword parameter(s) are: "
f"{kwargs.keys()}.",
FutureWarning,
)
super().__init__(
radius=radius,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.weights = weights
self.outlier_label = outlier_label
def fit(self, X, y):
"""Fit the radius neighbors classifier from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : {array-like, sparse matrix} of shape (n_samples,) or \
(n_samples, n_outputs)
Target values.
Returns
-------
self : RadiusNeighborsClassifier
The fitted radius neighbors classifier.
"""
self.weights = _check_weights(self.weights)
self._fit(X, y)
classes_ = self.classes_
_y = self._y
if not self.outputs_2d_:
_y = self._y.reshape((-1, 1))
classes_ = [self.classes_]
if self.outlier_label is None:
outlier_label_ = None
elif self.outlier_label == "most_frequent":
outlier_label_ = []
# iterate over multi-output, get the most frequent label for each
# output.
for k, classes_k in enumerate(classes_):
label_count = np.bincount(_y[:, k])
outlier_label_.append(classes_k[label_count.argmax()])
else:
if _is_arraylike(self.outlier_label) and not isinstance(
self.outlier_label, str
):
if len(self.outlier_label) != len(classes_):
raise ValueError(
"The length of outlier_label: {} is "
"inconsistent with the output "
"length: {}".format(self.outlier_label, len(classes_))
)
outlier_label_ = self.outlier_label
else:
outlier_label_ = [self.outlier_label] * len(classes_)
for classes, label in zip(classes_, outlier_label_):
if _is_arraylike(label) and not isinstance(label, str):
# ensure the outlier label for each output is a scalar.
raise TypeError(
"The outlier_label of classes {} is "
"supposed to be a scalar, got "
"{}.".format(classes, label)
)
if np.append(classes, label).dtype != classes.dtype:
# ensure the dtype of outlier label is consistent with y.
raise TypeError(
"The dtype of outlier_label {} is "
"inconsistent with classes {} in "
"y.".format(label, classes)
)
self.outlier_label_ = outlier_label_
return self
def predict(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
Class labels for each data sample.
"""
probs = self.predict_proba(X)
classes_ = self.classes_
if not self.outputs_2d_:
probs = [probs]
classes_ = [self.classes_]
n_outputs = len(classes_)
n_queries = probs[0].shape[0]
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
for k, prob in enumerate(probs):
# iterate over multi-output, assign labels based on probabilities
# of each output.
max_prob_index = prob.argmax(axis=1)
y_pred[:, k] = classes_[k].take(max_prob_index)
outlier_zero_probs = (prob == 0).all(axis=1)
if outlier_zero_probs.any():
zero_prob_index = np.flatnonzero(outlier_zero_probs)
y_pred[zero_prob_index, k] = self.outlier_label_[k]
if not self.outputs_2d_:
y_pred = y_pred.ravel()
return y_pred
def predict_proba(self, X):
"""Return probability estimates for the test data X.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
p : ndarray of shape (n_queries, n_classes), or a list of \
n_outputs of such arrays if n_outputs > 1.
The class probabilities of the input samples. Classes are ordered
by lexicographic order.
"""
n_queries = _num_samples(X)
neigh_dist, neigh_ind = self.radius_neighbors(X)
outlier_mask = np.zeros(n_queries, dtype=bool)
outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
outliers = np.flatnonzero(outlier_mask)
inliers = np.flatnonzero(~outlier_mask)
classes_ = self.classes_
_y = self._y
if not self.outputs_2d_:
_y = self._y.reshape((-1, 1))
classes_ = [self.classes_]
if self.outlier_label_ is None and outliers.size > 0:
raise ValueError(
"No neighbors found for test samples %r, "
"you can try using larger radius, "
"giving a label for outliers, "
"or considering removing them from your dataset." % outliers
)
weights = _get_weights(neigh_dist, self.weights)
if weights is not None:
weights = weights[inliers]
probabilities = []
# iterate over multi-output, measure probabilities of the k-th output.
for k, classes_k in enumerate(classes_):
pred_labels = np.zeros(len(neigh_ind), dtype=object)
pred_labels[:] = [_y[ind, k] for ind in neigh_ind]
proba_k = np.zeros((n_queries, classes_k.size))
proba_inl = np.zeros((len(inliers), classes_k.size))
# samples have different size of neighbors within the same radius
if weights is None:
for i, idx in enumerate(pred_labels[inliers]):
proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)
else:
for i, idx in enumerate(pred_labels[inliers]):
proba_inl[i, :] = np.bincount(
idx, weights[i], minlength=classes_k.size
)
proba_k[inliers, :] = proba_inl
if outliers.size > 0:
_outlier_label = self.outlier_label_[k]
label_index = np.flatnonzero(classes_k == _outlier_label)
if label_index.size == 1:
proba_k[outliers, label_index[0]] = 1.0
else:
warnings.warn(
"Outlier label {} is not in training "
"classes. All class probabilities of "
"outliers will be assigned with 0."
"".format(self.outlier_label_[k])
)
# normalize 'votes' into real [0,1] probabilities
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
probabilities.append(proba_k)
if not self.outputs_2d_:
probabilities = probabilities[0]
return probabilities
def _more_tags(self):
return {"multilabel": True}

View File

@@ -0,0 +1,20 @@
# TODO: Remove this file in 1.3
import warnings
from ..metrics import DistanceMetric as _DistanceMetric
class DistanceMetric(_DistanceMetric):
@classmethod
def _warn(cls):
warnings.warn(
"sklearn.neighbors.DistanceMetric has been moved "
"to sklearn.metrics.DistanceMetric in 1.0. "
"This import path will be removed in 1.3",
category=FutureWarning,
)
@classmethod
def get_metric(cls, metric, **kwargs):
DistanceMetric._warn()
return _DistanceMetric.get_metric(metric, **kwargs)

View File

@@ -0,0 +1,678 @@
"""Nearest Neighbors graph functions"""
# Author: Jake Vanderplas <vanderplas@astro.washington.edu>
# Tom Dupre la Tour
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam
from ._base import KNeighborsMixin, RadiusNeighborsMixin
from ._base import NeighborsBase
from ._unsupervised import NearestNeighbors
from ..base import TransformerMixin, _ClassNamePrefixFeaturesOutMixin
from ..utils.validation import check_is_fitted
def _check_params(X, metric, p, metric_params):
"""Check the validity of the input parameters"""
params = zip(["metric", "p", "metric_params"], [metric, p, metric_params])
est_params = X.get_params()
for param_name, func_param in params:
if func_param != est_params[param_name]:
raise ValueError(
"Got %s for %s, while the estimator has %s for the same parameter."
% (func_param, param_name, est_params[param_name])
)
def _query_include_self(X, include_self, mode):
"""Return the query based on include_self param"""
if include_self == "auto":
include_self = mode == "connectivity"
# it does not include each sample as its own neighbors
if not include_self:
X = None
return X
def kneighbors_graph(
X,
n_neighbors,
*,
mode="connectivity",
metric="minkowski",
p=2,
metric_params=None,
include_self=False,
n_jobs=None,
):
"""Compute the (weighted) graph of k-Neighbors for points in X.
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
Parameters
----------
X : array-like of shape (n_samples, n_features) or BallTree
Sample data, in the form of a numpy array or a precomputed
:class:`BallTree`.
n_neighbors : int
Number of neighbors for each sample.
mode : {'connectivity', 'distance'}, default='connectivity'
Type of returned matrix: 'connectivity' will return the connectivity
matrix with ones and zeros, and 'distance' will return the distances
between neighbors according to the given metric.
metric : str, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric.
For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric` and the metrics listed in
`sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the
"cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
include_self : bool or 'auto', default=False
Whether or not to mark each sample as the first nearest neighbor to
itself. If 'auto', then True is used for mode='connectivity' and False
for mode='distance'.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
A : sparse matrix of shape (n_samples, n_samples)
Graph where A[i, j] is assigned the weight of edge that
connects i to j. The matrix is of CSR format.
See Also
--------
radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X.
Examples
--------
>>> X = [[0], [3], [1]]
>>> from sklearn.neighbors import kneighbors_graph
>>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
>>> A.toarray()
array([[1., 0., 1.],
[0., 1., 1.],
[1., 0., 1.]])
"""
if not isinstance(X, KNeighborsMixin):
X = NearestNeighbors(
n_neighbors=n_neighbors,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
).fit(X)
else:
_check_params(X, metric, p, metric_params)
query = _query_include_self(X._fit_X, include_self, mode)
return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
def radius_neighbors_graph(
X,
radius,
*,
mode="connectivity",
metric="minkowski",
p=2,
metric_params=None,
include_self=False,
n_jobs=None,
):
"""Compute the (weighted) graph of Neighbors for points in X.
Neighborhoods are restricted the points at a distance lower than
radius.
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
Parameters
----------
X : array-like of shape (n_samples, n_features) or BallTree
Sample data, in the form of a numpy array or a precomputed
:class:`BallTree`.
radius : float
Radius of neighborhoods.
mode : {'connectivity', 'distance'}, default='connectivity'
Type of returned matrix: 'connectivity' will return the connectivity
matrix with ones and zeros, and 'distance' will return the distances
between neighbors according to the given metric.
metric : str, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric.
For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric` and the metrics listed in
`sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the
"cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
include_self : bool or 'auto', default=False
Whether or not to mark each sample as the first nearest neighbor to
itself. If 'auto', then True is used for mode='connectivity' and False
for mode='distance'.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
A : sparse matrix of shape (n_samples, n_samples)
Graph where A[i, j] is assigned the weight of edge that connects
i to j. The matrix is of CSR format.
See Also
--------
kneighbors_graph: Compute the weighted graph of k-neighbors for points in X.
Examples
--------
>>> X = [[0], [3], [1]]
>>> from sklearn.neighbors import radius_neighbors_graph
>>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',
... include_self=True)
>>> A.toarray()
array([[1., 0., 1.],
[0., 1., 0.],
[1., 0., 1.]])
"""
if not isinstance(X, RadiusNeighborsMixin):
X = NearestNeighbors(
radius=radius,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
).fit(X)
else:
_check_params(X, metric, p, metric_params)
query = _query_include_self(X._fit_X, include_self, mode)
return X.radius_neighbors_graph(query, radius, mode)
class KNeighborsTransformer(
_ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase
):
"""Transform X into a (weighted) graph of k nearest neighbors.
The transformed data is a sparse graph as returned by kneighbors_graph.
Read more in the :ref:`User Guide <neighbors_transformer>`.
.. versionadded:: 0.22
Parameters
----------
mode : {'distance', 'connectivity'}, default='distance'
Type of returned matrix: 'connectivity' will return the connectivity
matrix with ones and zeros, and 'distance' will return the distances
between neighbors according to the given metric.
n_neighbors : int, default=5
Number of neighbors for each sample in the transformed sparse graph.
For compatibility reasons, as each sample is considered as its own
neighbor, one extra neighbor will be computed when mode == 'distance'.
In this case, the sparse graph contains (n_neighbors + 1) neighbors.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
metric : str or callable, default='minkowski'
Metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
Distance matrices are not supported.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.
p : int, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=1
The number of parallel jobs to run for neighbors search.
If ``-1``, then the number of jobs is set to the number of CPU cores.
Attributes
----------
effective_metric_ : str or callable
The distance metric used. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
See Also
--------
kneighbors_graph : Compute the weighted graph of k-neighbors for
points in X.
RadiusNeighborsTransformer : Transform X into a weighted graph of
neighbors nearer than a radius.
Examples
--------
>>> from sklearn.datasets import load_wine
>>> from sklearn.neighbors import KNeighborsTransformer
>>> X, _ = load_wine(return_X_y=True)
>>> X.shape
(178, 13)
>>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance')
>>> X_dist_graph = transformer.fit_transform(X)
>>> X_dist_graph.shape
(178, 178)
"""
def __init__(
self,
*,
mode="distance",
n_neighbors=5,
algorithm="auto",
leaf_size=30,
metric="minkowski",
p=2,
metric_params=None,
n_jobs=1,
):
super(KNeighborsTransformer, self).__init__(
n_neighbors=n_neighbors,
radius=None,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.mode = mode
def fit(self, X, y=None):
"""Fit the k-nearest neighbors transformer from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : KNeighborsTransformer
The fitted k-nearest neighbors transformer.
"""
self._fit(X)
self._n_features_out = self.n_samples_fit_
return self
def transform(self, X):
"""Compute the (weighted) graph of Neighbors for points in X.
Parameters
----------
X : array-like of shape (n_samples_transform, n_features)
Sample data.
Returns
-------
Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
Xt[i, j] is assigned the weight of edge that connects i to j.
Only the neighbors have an explicit value.
The diagonal is always explicit.
The matrix is of CSR format.
"""
check_is_fitted(self)
add_one = self.mode == "distance"
return self.kneighbors_graph(
X, mode=self.mode, n_neighbors=self.n_neighbors + add_one
)
def fit_transform(self, X, y=None):
"""Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params
and returns a transformed version of X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
Xt : sparse matrix of shape (n_samples, n_samples)
Xt[i, j] is assigned the weight of edge that connects i to j.
Only the neighbors have an explicit value.
The diagonal is always explicit.
The matrix is of CSR format.
"""
return self.fit(X).transform(X)
def _more_tags(self):
return {
"_xfail_checks": {
"check_methods_sample_order_invariance": "check is not applicable."
}
}
class RadiusNeighborsTransformer(
_ClassNamePrefixFeaturesOutMixin,
RadiusNeighborsMixin,
TransformerMixin,
NeighborsBase,
):
"""Transform X into a (weighted) graph of neighbors nearer than a radius.
The transformed data is a sparse graph as returned by
`radius_neighbors_graph`.
Read more in the :ref:`User Guide <neighbors_transformer>`.
.. versionadded:: 0.22
Parameters
----------
mode : {'distance', 'connectivity'}, default='distance'
Type of returned matrix: 'connectivity' will return the connectivity
matrix with ones and zeros, and 'distance' will return the distances
between neighbors according to the given metric.
radius : float, default=1.0
Radius of neighborhood in the transformed sparse graph.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
metric : str or callable, default='minkowski'
Metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
Distance matrices are not supported.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.
p : int, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=1
The number of parallel jobs to run for neighbors search.
If ``-1``, then the number of jobs is set to the number of CPU cores.
Attributes
----------
effective_metric_ : str or callable
The distance metric used. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
See Also
--------
kneighbors_graph : Compute the weighted graph of k-neighbors for
points in X.
KNeighborsTransformer : Transform X into a weighted graph of k
nearest neighbors.
Examples
--------
>>> import numpy as np
>>> from sklearn.datasets import load_wine
>>> from sklearn.cluster import DBSCAN
>>> from sklearn.neighbors import RadiusNeighborsTransformer
>>> from sklearn.pipeline import make_pipeline
>>> X, _ = load_wine(return_X_y=True)
>>> estimator = make_pipeline(
... RadiusNeighborsTransformer(radius=42.0, mode='distance'),
... DBSCAN(eps=25.0, metric='precomputed'))
>>> X_clustered = estimator.fit_predict(X)
>>> clusters, counts = np.unique(X_clustered, return_counts=True)
>>> print(counts)
[ 29 15 111 11 12]
"""
def __init__(
self,
*,
mode="distance",
radius=1.0,
algorithm="auto",
leaf_size=30,
metric="minkowski",
p=2,
metric_params=None,
n_jobs=1,
):
super(RadiusNeighborsTransformer, self).__init__(
n_neighbors=None,
radius=radius,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.mode = mode
def fit(self, X, y=None):
"""Fit the radius neighbors transformer from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : RadiusNeighborsTransformer
The fitted radius neighbors transformer.
"""
self._fit(X)
self._n_features_out = self.n_samples_fit_
return self
def transform(self, X):
"""Compute the (weighted) graph of Neighbors for points in X.
Parameters
----------
X : array-like of shape (n_samples_transform, n_features)
Sample data.
Returns
-------
Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
Xt[i, j] is assigned the weight of edge that connects i to j.
Only the neighbors have an explicit value.
The diagonal is always explicit.
The matrix is of CSR format.
"""
check_is_fitted(self)
return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True)
def fit_transform(self, X, y=None):
"""Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params
and returns a transformed version of X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
Xt : sparse matrix of shape (n_samples, n_samples)
Xt[i, j] is assigned the weight of edge that connects i to j.
Only the neighbors have an explicit value.
The diagonal is always explicit.
The matrix is of CSR format.
"""
return self.fit(X).transform(X)
def _more_tags(self):
return {
"_xfail_checks": {
"check_methods_sample_order_invariance": "check is not applicable."
}
}

View File

@@ -0,0 +1,328 @@
"""
Kernel Density Estimation
-------------------------
"""
# Author: Jake Vanderplas <jakevdp@cs.washington.edu>
import numpy as np
from scipy.special import gammainc
from ..base import BaseEstimator
from ..utils import check_random_state
from ..utils.validation import _check_sample_weight, check_is_fitted
from ..utils.extmath import row_norms
from ._ball_tree import BallTree, DTYPE
from ._kd_tree import KDTree
VALID_KERNELS = [
"gaussian",
"tophat",
"epanechnikov",
"exponential",
"linear",
"cosine",
]
TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree}
# TODO: implement a brute force version for testing purposes
# TODO: bandwidth estimation
# TODO: create a density estimation base class?
class KernelDensity(BaseEstimator):
"""Kernel Density Estimation.
Read more in the :ref:`User Guide <kernel_density>`.
Parameters
----------
bandwidth : float, default=1.0
The bandwidth of the kernel.
algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto'
The tree algorithm to use.
kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \
'cosine'}, default='gaussian'
The kernel to use.
metric : str, default='euclidean'
The distance metric to use. Note that not all metrics are
valid with all algorithms. Refer to the documentation of
:class:`BallTree` and :class:`KDTree` for a description of
available algorithms. Note that the normalization of the density
output is correct only for the Euclidean distance metric. Default
is 'euclidean'.
atol : float, default=0
The desired absolute tolerance of the result. A larger tolerance will
generally lead to faster execution.
rtol : float, default=0
The desired relative tolerance of the result. A larger tolerance will
generally lead to faster execution.
breadth_first : bool, default=True
If true (default), use a breadth-first approach to the problem.
Otherwise use a depth-first approach.
leaf_size : int, default=40
Specify the leaf size of the underlying tree. See :class:`BallTree`
or :class:`KDTree` for details.
metric_params : dict, default=None
Additional parameters to be passed to the tree for use with the
metric. For more information, see the documentation of
:class:`BallTree` or :class:`KDTree`.
Attributes
----------
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
tree_ : ``BinaryTree`` instance
The tree algorithm for fast generalized N-point problems.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point
problems.
sklearn.neighbors.BallTree : Ball tree for fast generalized N-point
problems.
Examples
--------
Compute a gaussian kernel density estimate with a fixed bandwidth.
>>> from sklearn.neighbors import KernelDensity
>>> import numpy as np
>>> rng = np.random.RandomState(42)
>>> X = rng.random_sample((100, 3))
>>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
>>> log_density = kde.score_samples(X[:3])
>>> log_density
array([-1.52955942, -1.51462041, -1.60244657])
"""
def __init__(
self,
*,
bandwidth=1.0,
algorithm="auto",
kernel="gaussian",
metric="euclidean",
atol=0,
rtol=0,
breadth_first=True,
leaf_size=40,
metric_params=None,
):
self.algorithm = algorithm
self.bandwidth = bandwidth
self.kernel = kernel
self.metric = metric
self.atol = atol
self.rtol = rtol
self.breadth_first = breadth_first
self.leaf_size = leaf_size
self.metric_params = metric_params
def _choose_algorithm(self, algorithm, metric):
# given the algorithm string + metric string, choose the optimal
# algorithm to compute the result.
if algorithm == "auto":
# use KD Tree if possible
if metric in KDTree.valid_metrics:
return "kd_tree"
elif metric in BallTree.valid_metrics:
return "ball_tree"
else:
raise ValueError("invalid metric: '{0}'".format(metric))
elif algorithm in TREE_DICT:
if metric not in TREE_DICT[algorithm].valid_metrics:
raise ValueError(
"invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric)
)
return algorithm
else:
raise ValueError("invalid algorithm: '{0}'".format(algorithm))
def fit(self, X, y=None, sample_weight=None):
"""Fit the Kernel Density model on the data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
y : None
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.
sample_weight : array-like of shape (n_samples,), default=None
List of sample weights attached to the data X.
.. versionadded:: 0.20
Returns
-------
self : object
Returns the instance itself.
"""
algorithm = self._choose_algorithm(self.algorithm, self.metric)
if self.bandwidth <= 0:
raise ValueError("bandwidth must be positive")
if self.kernel not in VALID_KERNELS:
raise ValueError("invalid kernel: '{0}'".format(self.kernel))
X = self._validate_data(X, order="C", dtype=DTYPE)
if sample_weight is not None:
sample_weight = _check_sample_weight(
sample_weight, X, DTYPE, only_non_negative=True
)
kwargs = self.metric_params
if kwargs is None:
kwargs = {}
self.tree_ = TREE_DICT[algorithm](
X,
metric=self.metric,
leaf_size=self.leaf_size,
sample_weight=sample_weight,
**kwargs,
)
return self
def score_samples(self, X):
"""Compute the log-likelihood of each sample under the model.
Parameters
----------
X : array-like of shape (n_samples, n_features)
An array of points to query. Last dimension should match dimension
of training data (n_features).
Returns
-------
density : ndarray of shape (n_samples,)
Log-likelihood of each sample in `X`. These are normalized to be
probability densities, so values will be low for high-dimensional
data.
"""
check_is_fitted(self)
# The returned density is normalized to the number of points.
# For it to be a probability, we must scale it. For this reason
# we'll also scale atol.
X = self._validate_data(X, order="C", dtype=DTYPE, reset=False)
if self.tree_.sample_weight is None:
N = self.tree_.data.shape[0]
else:
N = self.tree_.sum_weight
atol_N = self.atol * N
log_density = self.tree_.kernel_density(
X,
h=self.bandwidth,
kernel=self.kernel,
atol=atol_N,
rtol=self.rtol,
breadth_first=self.breadth_first,
return_log=True,
)
log_density -= np.log(N)
return log_density
def score(self, X, y=None):
"""Compute the total log-likelihood under the model.
Parameters
----------
X : array-like of shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
y : None
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.
Returns
-------
logprob : float
Total log-likelihood of the data in X. This is normalized to be a
probability density, so the value will be low for high-dimensional
data.
"""
return np.sum(self.score_samples(X))
def sample(self, n_samples=1, random_state=None):
"""Generate random samples from the model.
Currently, this is implemented only for gaussian and tophat kernels.
Parameters
----------
n_samples : int, default=1
Number of samples to generate.
random_state : int, RandomState instance or None, default=None
Determines random number generation used to generate
random samples. Pass an int for reproducible results
across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
X : array-like of shape (n_samples, n_features)
List of samples.
"""
check_is_fitted(self)
# TODO: implement sampling for other valid kernel shapes
if self.kernel not in ["gaussian", "tophat"]:
raise NotImplementedError()
data = np.asarray(self.tree_.data)
rng = check_random_state(random_state)
u = rng.uniform(0, 1, size=n_samples)
if self.tree_.sample_weight is None:
i = (u * data.shape[0]).astype(np.int64)
else:
cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
sum_weight = cumsum_weight[-1]
i = np.searchsorted(cumsum_weight, u * sum_weight)
if self.kernel == "gaussian":
return np.atleast_2d(rng.normal(data[i], self.bandwidth))
elif self.kernel == "tophat":
# we first draw points from a d-dimensional normal distribution,
# then use an incomplete gamma function to map them to a uniform
# d-dimensional tophat distribution.
dim = data.shape[1]
X = rng.normal(size=(n_samples, dim))
s_sq = row_norms(X, squared=True)
correction = (
gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
* self.bandwidth
/ np.sqrt(s_sq)
)
return data[i] + X * correction[:, np.newaxis]
def _more_tags(self):
return {
"_xfail_checks": {
"check_sample_weights_invariance": (
"sample_weight must have positive values"
),
}
}

View File

@@ -0,0 +1,500 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
import numpy as np
import warnings
from ._base import NeighborsBase
from ._base import KNeighborsMixin
from ..base import OutlierMixin
from ..utils.metaestimators import available_if
from ..utils.validation import check_is_fitted
from ..utils import check_array
__all__ = ["LocalOutlierFactor"]
class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
"""Unsupervised Outlier Detection using the Local Outlier Factor (LOF).
The anomaly score of each sample is called the Local Outlier Factor.
It measures the local deviation of the density of a given sample with respect
to its neighbors.
It is local in that the anomaly score depends on how isolated the object
is with respect to the surrounding neighborhood.
More precisely, locality is given by k-nearest neighbors, whose distance
is used to estimate the local density.
By comparing the local density of a sample to the local densities of its
neighbors, one can identify samples that have a substantially lower density
than their neighbors. These are considered outliers.
.. versionadded:: 0.19
Parameters
----------
n_neighbors : int, default=20
Number of neighbors to use by default for :meth:`kneighbors` queries.
If n_neighbors is larger than the number of samples provided,
all samples will be used.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can
affect the speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
metric : str or callable, default='minkowski'
The metric is used for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a sparse matrix, in which case only "nonzero"
elements may be considered neighbors.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']
See the documentation for scipy.spatial.distance for details on these
metrics:
https://docs.scipy.org/doc/scipy/reference/spatial.distance.html.
p : int, default=2
Parameter for the Minkowski metric from
:func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this
is equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
contamination : 'auto' or float, default='auto'
The amount of contamination of the data set, i.e. the proportion
of outliers in the data set. When fitting this is used to define the
threshold on the scores of the samples.
- if 'auto', the threshold is determined as in the
original paper,
- if a float, the contamination should be in the range (0, 0.5].
.. versionchanged:: 0.22
The default value of ``contamination`` changed from 0.1
to ``'auto'``.
novelty : bool, default=False
By default, LocalOutlierFactor is only meant to be used for outlier
detection (novelty=False). Set novelty to True if you want to use
LocalOutlierFactor for novelty detection. In this case be aware that
you should only use predict, decision_function and score_samples
on new unseen data and not on the training set; and note that the
results obtained this way may differ from the standard LOF results.
.. versionadded:: 0.20
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
negative_outlier_factor_ : ndarray of shape (n_samples,)
The opposite LOF of the training samples. The higher, the more normal.
Inliers tend to have a LOF score close to 1
(``negative_outlier_factor_`` close to -1), while outliers tend to have
a larger LOF score.
The local outlier factor (LOF) of a sample captures its
supposed 'degree of abnormality'.
It is the average of the ratio of the local reachability density of
a sample and those of its k-nearest neighbors.
n_neighbors_ : int
The actual number of neighbors used for :meth:`kneighbors` queries.
offset_ : float
Offset used to obtain binary labels from the raw scores.
Observations having a negative_outlier_factor smaller than `offset_`
are detected as abnormal.
The offset is set to -1.5 (inliers score around -1), except when a
contamination parameter different than "auto" is provided. In that
case, the offset is defined in such a way we obtain the expected
number of outliers in training.
.. versionadded:: 0.20
effective_metric_ : str
The effective metric used for the distance computation.
effective_metric_params_ : dict
The effective additional keyword arguments for the metric function.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
It is the number of samples in the fitted data.
See Also
--------
sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using
Support Vector Machine.
References
----------
.. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).
LOF: identifying density-based local outliers. In ACM sigmod record.
Examples
--------
>>> import numpy as np
>>> from sklearn.neighbors import LocalOutlierFactor
>>> X = [[-1.1], [0.2], [101.1], [0.3]]
>>> clf = LocalOutlierFactor(n_neighbors=2)
>>> clf.fit_predict(X)
array([ 1, 1, -1, 1])
>>> clf.negative_outlier_factor_
array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...])
"""
def __init__(
self,
n_neighbors=20,
*,
algorithm="auto",
leaf_size=30,
metric="minkowski",
p=2,
metric_params=None,
contamination="auto",
novelty=False,
n_jobs=None,
):
super().__init__(
n_neighbors=n_neighbors,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.contamination = contamination
self.novelty = novelty
def _check_novelty_fit_predict(self):
if self.novelty:
msg = (
"fit_predict is not available when novelty=True. Use "
"novelty=False if you want to predict on the training set."
)
raise AttributeError(msg)
return True
@available_if(_check_novelty_fit_predict)
def fit_predict(self, X, y=None):
"""Fit the model to the training set X and return the labels.
**Not available for novelty detection (when novelty is set to True).**
Label is 1 for an inlier and -1 for an outlier according to the LOF
score and the contamination parameter.
Parameters
----------
X : array-like of shape (n_samples, n_features), default=None
The query sample or samples to compute the Local Outlier Factor
w.r.t. to the training samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
Returns -1 for anomalies/outliers and 1 for inliers.
"""
# As fit_predict would be different from fit.predict, fit_predict is
# only available for outlier detection (novelty=False)
return self.fit(X)._predict()
def fit(self, X, y=None):
"""Fit the local outlier factor detector from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : LocalOutlierFactor
The fitted local outlier factor detector.
"""
self._fit(X)
if self.contamination != "auto":
if not (0.0 < self.contamination <= 0.5):
raise ValueError(
"contamination must be in (0, 0.5], got: %f" % self.contamination
)
n_samples = self.n_samples_fit_
if self.n_neighbors > n_samples:
warnings.warn(
"n_neighbors (%s) is greater than the "
"total number of samples (%s). n_neighbors "
"will be set to (n_samples - 1) for estimation."
% (self.n_neighbors, n_samples)
)
self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
n_neighbors=self.n_neighbors_
)
self._lrd = self._local_reachability_density(
self._distances_fit_X_, _neighbors_indices_fit_X_
)
# Compute lof score over training samples to define offset_:
lrd_ratios_array = (
self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
)
self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
if self.contamination == "auto":
# inliers score around -1 (the higher, the less abnormal).
self.offset_ = -1.5
else:
self.offset_ = np.percentile(
self.negative_outlier_factor_, 100.0 * self.contamination
)
return self
def _check_novelty_predict(self):
if not self.novelty:
msg = (
"predict is not available when novelty=False, use "
"fit_predict if you want to predict on training data. Use "
"novelty=True if you want to use LOF for novelty detection "
"and predict on new unseen data."
)
raise AttributeError(msg)
return True
@available_if(_check_novelty_predict)
def predict(self, X=None):
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
**Only available for novelty detection (when novelty is set to True).**
This method allows to generalize prediction to *new observations* (not
in the training set). Note that the result of ``clf.fit(X)`` then
``clf.predict(X)`` with ``novelty=True`` may differ from the result
obtained by ``clf.fit_predict(X)`` with ``novelty=False``.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The query sample or samples to compute the Local Outlier Factor
w.r.t. to the training samples.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
Returns -1 for anomalies/outliers and +1 for inliers.
"""
return self._predict(X)
def _predict(self, X=None):
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
If X is None, returns the same as fit_predict(X_train).
Parameters
----------
X : array-like of shape (n_samples, n_features), default=None
The query sample or samples to compute the Local Outlier Factor
w.r.t. to the training samples. If None, makes prediction on the
training data without considering them as their own neighbors.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
Returns -1 for anomalies/outliers and +1 for inliers.
"""
check_is_fitted(self)
if X is not None:
X = check_array(X, accept_sparse="csr")
is_inlier = np.ones(X.shape[0], dtype=int)
is_inlier[self.decision_function(X) < 0] = -1
else:
is_inlier = np.ones(self.n_samples_fit_, dtype=int)
is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
return is_inlier
def _check_novelty_decision_function(self):
if not self.novelty:
msg = (
"decision_function is not available when novelty=False. "
"Use novelty=True if you want to use LOF for novelty "
"detection and compute decision_function for new unseen "
"data. Note that the opposite LOF of the training samples "
"is always available by considering the "
"negative_outlier_factor_ attribute."
)
raise AttributeError(msg)
return True
@available_if(_check_novelty_decision_function)
def decision_function(self, X):
"""Shifted opposite of the Local Outlier Factor of X.
Bigger is better, i.e. large values correspond to inliers.
**Only available for novelty detection (when novelty is set to True).**
The shift offset allows a zero threshold for being an outlier.
The argument X is supposed to contain *new data*: if X contains a
point from training, it considers the later in its own neighborhood.
Also, the samples in X are not considered in the neighborhood of any
point.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The query sample or samples to compute the Local Outlier Factor
w.r.t. the training samples.
Returns
-------
shifted_opposite_lof_scores : ndarray of shape (n_samples,)
The shifted opposite of the Local Outlier Factor of each input
samples. The lower, the more abnormal. Negative scores represent
outliers, positive scores represent inliers.
"""
return self.score_samples(X) - self.offset_
def _check_novelty_score_samples(self):
if not self.novelty:
msg = (
"score_samples is not available when novelty=False. The "
"scores of the training samples are always available "
"through the negative_outlier_factor_ attribute. Use "
"novelty=True if you want to use LOF for novelty detection "
"and compute score_samples for new unseen data."
)
raise AttributeError(msg)
return True
@available_if(_check_novelty_score_samples)
def score_samples(self, X):
"""Opposite of the Local Outlier Factor of X.
It is the opposite as bigger is better, i.e. large values correspond
to inliers.
**Only available for novelty detection (when novelty is set to True).**
The argument X is supposed to contain *new data*: if X contains a
point from training, it considers the later in its own neighborhood.
Also, the samples in X are not considered in the neighborhood of any
point. Because of this, the scores obtained via ``score_samples`` may
differ from the standard LOF scores.
The standard LOF scores for the training data is available via the
``negative_outlier_factor_`` attribute.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The query sample or samples to compute the Local Outlier Factor
w.r.t. the training samples.
Returns
-------
opposite_lof_scores : ndarray of shape (n_samples,)
The opposite of the Local Outlier Factor of each input samples.
The lower, the more abnormal.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse="csr")
distances_X, neighbors_indices_X = self.kneighbors(
X, n_neighbors=self.n_neighbors_
)
X_lrd = self._local_reachability_density(distances_X, neighbors_indices_X)
lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]
# as bigger is better:
return -np.mean(lrd_ratios_array, axis=1)
def _local_reachability_density(self, distances_X, neighbors_indices):
"""The local reachability density (LRD)
The LRD of a sample is the inverse of the average reachability
distance of its k-nearest neighbors.
Parameters
----------
distances_X : ndarray of shape (n_queries, self.n_neighbors)
Distances to the neighbors (in the training samples `self._fit_X`)
of each query point to compute the LRD.
neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)
Neighbors indices (of each query point) among training samples
self._fit_X.
Returns
-------
local_reachability_density : ndarray of shape (n_queries,)
The local reachability density of each sample.
"""
dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]
reach_dist_array = np.maximum(distances_X, dist_k)
# 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_:
return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)

View File

@@ -0,0 +1,568 @@
"""
Neighborhood Component Analysis
"""
# Authors: William de Vazelhes <wdevazelhes@gmail.com>
# John Chiotellis <ioannis.chiotellis@in.tum.de>
# License: BSD 3 clause
from warnings import warn
import numpy as np
import sys
import time
import numbers
from scipy.optimize import minimize
from ..utils.extmath import softmax
from ..metrics import pairwise_distances
from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin
from ..preprocessing import LabelEncoder
from ..decomposition import PCA
from ..utils.multiclass import check_classification_targets
from ..utils.random import check_random_state
from ..utils.validation import check_is_fitted, check_array, check_scalar
from ..exceptions import ConvergenceWarning
class NeighborhoodComponentsAnalysis(
_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
):
"""Neighborhood Components Analysis.
Neighborhood Component Analysis (NCA) is a machine learning algorithm for
metric learning. It learns a linear transformation in a supervised fashion
to improve the classification accuracy of a stochastic nearest neighbors
rule in the transformed space.
Read more in the :ref:`User Guide <nca>`.
Parameters
----------
n_components : int, default=None
Preferred dimensionality of the projected space.
If None it will be set to `n_features`.
init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \
(n_features_a, n_features_b), default='auto'
Initialization of the linear transformation. Possible options are
`'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy
array of shape `(n_features_a, n_features_b)`.
- `'auto'`
Depending on `n_components`, the most reasonable initialization
will be chosen. If `n_components <= n_classes` we use `'lda'`, as
it uses labels information. If not, but
`n_components < min(n_features, n_samples)`, we use `'pca'`, as
it projects data in meaningful directions (those of higher
variance). Otherwise, we just use `'identity'`.
- `'pca'`
`n_components` principal components of the inputs passed
to :meth:`fit` will be used to initialize the transformation.
(See :class:`~sklearn.decomposition.PCA`)
- `'lda'`
`min(n_components, n_classes)` most discriminative
components of the inputs passed to :meth:`fit` will be used to
initialize the transformation. (If `n_components > n_classes`,
the rest of the components will be zero.) (See
:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
- `'identity'`
If `n_components` is strictly smaller than the
dimensionality of the inputs passed to :meth:`fit`, the identity
matrix will be truncated to the first `n_components` rows.
- `'random'`
The initial transformation will be a random array of shape
`(n_components, n_features)`. Each value is sampled from the
standard normal distribution.
- numpy array
`n_features_b` must match the dimensionality of the inputs passed
to :meth:`fit` and n_features_a must be less than or equal to that.
If `n_components` is not `None`, `n_features_a` must match it.
warm_start : bool, default=False
If `True` and :meth:`fit` has been called before, the solution of the
previous call to :meth:`fit` is used as the initial linear
transformation (`n_components` and `init` will be ignored).
max_iter : int, default=50
Maximum number of iterations in the optimization.
tol : float, default=1e-5
Convergence tolerance for the optimization.
callback : callable, default=None
If not `None`, this function is called after every iteration of the
optimizer, taking as arguments the current solution (flattened
transformation matrix) and the number of iterations. This might be
useful in case one wants to examine or store the transformation
found after each iteration.
verbose : int, default=0
If 0, no progress messages will be printed.
If 1, progress messages will be printed to stdout.
If > 1, progress messages will be printed and the `disp`
parameter of :func:`scipy.optimize.minimize` will be set to
`verbose - 2`.
random_state : int or numpy.RandomState, default=None
A pseudo random number generator object or a seed for it if int. If
`init='random'`, `random_state` is used to initialize the random
transformation. If `init='pca'`, `random_state` is passed as an
argument to PCA when initializing the transformation. Pass an int
for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
components_ : ndarray of shape (n_components, n_features)
The linear transformation learned during fitting.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
n_iter_ : int
Counts the number of iterations performed by the optimizer.
random_state_ : numpy.RandomState
Pseudo random number generator object used during initialization.
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear
Discriminant Analysis.
sklearn.decomposition.PCA : Principal component analysis (PCA).
References
----------
.. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.
"Neighbourhood Components Analysis". Advances in Neural Information
Processing Systems. 17, 513-520, 2005.
http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf
.. [2] Wikipedia entry on Neighborhood Components Analysis
https://en.wikipedia.org/wiki/Neighbourhood_components_analysis
Examples
--------
>>> from sklearn.neighbors import NeighborhoodComponentsAnalysis
>>> from sklearn.neighbors import KNeighborsClassifier
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> X, y = load_iris(return_X_y=True)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
... stratify=y, test_size=0.7, random_state=42)
>>> nca = NeighborhoodComponentsAnalysis(random_state=42)
>>> nca.fit(X_train, y_train)
NeighborhoodComponentsAnalysis(...)
>>> knn = KNeighborsClassifier(n_neighbors=3)
>>> knn.fit(X_train, y_train)
KNeighborsClassifier(...)
>>> print(knn.score(X_test, y_test))
0.933333...
>>> knn.fit(nca.transform(X_train), y_train)
KNeighborsClassifier(...)
>>> print(knn.score(nca.transform(X_test), y_test))
0.961904...
"""
def __init__(
self,
n_components=None,
*,
init="auto",
warm_start=False,
max_iter=50,
tol=1e-5,
callback=None,
verbose=0,
random_state=None,
):
self.n_components = n_components
self.init = init
self.warm_start = warm_start
self.max_iter = max_iter
self.tol = tol
self.callback = callback
self.verbose = verbose
self.random_state = random_state
def fit(self, X, y):
"""Fit the model according to the given training data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training samples.
y : array-like of shape (n_samples,)
The corresponding training labels.
Returns
-------
self : object
Fitted estimator.
"""
# Verify inputs X and y and NCA parameters, and transform a copy if
# needed
X, y, init = self._validate_params(X, y)
# Initialize the random generator
self.random_state_ = check_random_state(self.random_state)
# Measure the total training time
t_train = time.time()
# Compute a mask that stays fixed during optimization:
same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
# (n_samples, n_samples)
# Initialize the transformation
transformation = self._initialize(X, y, init)
# Create a dictionary of parameters to be passed to the optimizer
disp = self.verbose - 2 if self.verbose > 1 else -1
optimizer_params = {
"method": "L-BFGS-B",
"fun": self._loss_grad_lbfgs,
"args": (X, same_class_mask, -1.0),
"jac": True,
"x0": transformation,
"tol": self.tol,
"options": dict(maxiter=self.max_iter, disp=disp),
"callback": self._callback,
}
# Call the optimizer
self.n_iter_ = 0
opt_result = minimize(**optimizer_params)
# Reshape the solution found by the optimizer
self.components_ = opt_result.x.reshape(-1, X.shape[1])
self._n_features_out = self.components_.shape[1]
# Stop timer
t_train = time.time() - t_train
if self.verbose:
cls_name = self.__class__.__name__
# Warn the user if the algorithm did not converge
if not opt_result.success:
warn(
"[{}] NCA did not converge: {}".format(
cls_name, opt_result.message
),
ConvergenceWarning,
)
print("[{}] Training took {:8.2f}s.".format(cls_name, t_train))
return self
def transform(self, X):
"""Apply the learned transformation to the given data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data samples.
Returns
-------
X_embedded: ndarray of shape (n_samples, n_components)
The data samples transformed.
Raises
------
NotFittedError
If :meth:`fit` has not been called before.
"""
check_is_fitted(self)
X = self._validate_data(X, reset=False)
return np.dot(X, self.components_.T)
def _validate_params(self, X, y):
"""Validate parameters as soon as :meth:`fit` is called.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training samples.
y : array-like of shape (n_samples,)
The corresponding training labels.
Returns
-------
X : ndarray of shape (n_samples, n_features)
The validated training samples.
y : ndarray of shape (n_samples,)
The validated training labels, encoded to be integers in
the `range(0, n_classes)`.
init : str or ndarray of shape (n_features_a, n_features_b)
The validated initialization of the linear transformation.
Raises
-------
TypeError
If a parameter is not an instance of the desired type.
ValueError
If a parameter's value violates its legal value range or if the
combination of two or more given parameters is incompatible.
"""
# Validate the inputs X and y, and converts y to numerical classes.
X, y = self._validate_data(X, y, ensure_min_samples=2)
check_classification_targets(y)
y = LabelEncoder().fit_transform(y)
# Check the preferred dimensionality of the projected space
if self.n_components is not None:
check_scalar(self.n_components, "n_components", numbers.Integral, min_val=1)
if self.n_components > X.shape[1]:
raise ValueError(
"The preferred dimensionality of the "
"projected space `n_components` ({}) cannot "
"be greater than the given data "
"dimensionality ({})!".format(self.n_components, X.shape[1])
)
# If warm_start is enabled, check that the inputs are consistent
check_scalar(self.warm_start, "warm_start", bool)
if self.warm_start and hasattr(self, "components_"):
if self.components_.shape[1] != X.shape[1]:
raise ValueError(
"The new inputs dimensionality ({}) does not "
"match the input dimensionality of the "
"previously learned transformation ({}).".format(
X.shape[1], self.components_.shape[1]
)
)
check_scalar(self.max_iter, "max_iter", numbers.Integral, min_val=1)
check_scalar(self.tol, "tol", numbers.Real, min_val=0.0)
check_scalar(self.verbose, "verbose", numbers.Integral, min_val=0)
if self.callback is not None:
if not callable(self.callback):
raise ValueError("`callback` is not callable.")
# Check how the linear transformation should be initialized
init = self.init
if isinstance(init, np.ndarray):
init = check_array(init)
# Assert that init.shape[1] = X.shape[1]
if init.shape[1] != X.shape[1]:
raise ValueError(
"The input dimensionality ({}) of the given "
"linear transformation `init` must match the "
"dimensionality of the given inputs `X` ({}).".format(
init.shape[1], X.shape[1]
)
)
# Assert that init.shape[0] <= init.shape[1]
if init.shape[0] > init.shape[1]:
raise ValueError(
"The output dimensionality ({}) of the given "
"linear transformation `init` cannot be "
"greater than its input dimensionality ({}).".format(
init.shape[0], init.shape[1]
)
)
if self.n_components is not None:
# Assert that self.n_components = init.shape[0]
if self.n_components != init.shape[0]:
raise ValueError(
"The preferred dimensionality of the "
"projected space `n_components` ({}) does"
" not match the output dimensionality of "
"the given linear transformation "
"`init` ({})!".format(self.n_components, init.shape[0])
)
elif init in ["auto", "pca", "lda", "identity", "random"]:
pass
else:
raise ValueError(
"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' "
"or a numpy array of shape (n_components, n_features)."
)
return X, y, init
def _initialize(self, X, y, init):
"""Initialize the transformation.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training samples.
y : array-like of shape (n_samples,)
The training labels.
init : str or ndarray of shape (n_features_a, n_features_b)
The validated initialization of the linear transformation.
Returns
-------
transformation : ndarray of shape (n_components, n_features)
The initialized linear transformation.
"""
transformation = init
if self.warm_start and hasattr(self, "components_"):
transformation = self.components_
elif isinstance(init, np.ndarray):
pass
else:
n_samples, n_features = X.shape
n_components = self.n_components or n_features
if init == "auto":
n_classes = len(np.unique(y))
if n_components <= min(n_features, n_classes - 1):
init = "lda"
elif n_components < min(n_features, n_samples):
init = "pca"
else:
init = "identity"
if init == "identity":
transformation = np.eye(n_components, X.shape[1])
elif init == "random":
transformation = self.random_state_.standard_normal(
size=(n_components, X.shape[1])
)
elif init in {"pca", "lda"}:
init_time = time.time()
if init == "pca":
pca = PCA(
n_components=n_components, random_state=self.random_state_
)
if self.verbose:
print("Finding principal components... ", end="")
sys.stdout.flush()
pca.fit(X)
transformation = pca.components_
elif init == "lda":
from ..discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=n_components)
if self.verbose:
print("Finding most discriminative components... ", end="")
sys.stdout.flush()
lda.fit(X, y)
transformation = lda.scalings_.T[:n_components]
if self.verbose:
print("done in {:5.2f}s".format(time.time() - init_time))
return transformation
def _callback(self, transformation):
"""Called after each iteration of the optimizer.
Parameters
----------
transformation : ndarray of shape (n_components * n_features,)
The solution computed by the optimizer in this iteration.
"""
if self.callback is not None:
self.callback(transformation, self.n_iter_)
self.n_iter_ += 1
def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0):
"""Compute the loss and the loss gradient w.r.t. `transformation`.
Parameters
----------
transformation : ndarray of shape (n_components * n_features,)
The raveled linear transformation on which to compute loss and
evaluate gradient.
X : ndarray of shape (n_samples, n_features)
The training samples.
same_class_mask : ndarray of shape (n_samples, n_samples)
A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong
to the same class, and `0` otherwise.
Returns
-------
loss : float
The loss computed for the given transformation.
gradient : ndarray of shape (n_components * n_features,)
The new (flattened) gradient of the loss.
"""
if self.n_iter_ == 0:
self.n_iter_ += 1
if self.verbose:
header_fields = ["Iteration", "Objective Value", "Time(s)"]
header_fmt = "{:>10} {:>20} {:>10}"
header = header_fmt.format(*header_fields)
cls_name = self.__class__.__name__
print("[{}]".format(cls_name))
print(
"[{}] {}\n[{}] {}".format(
cls_name, header, cls_name, "-" * len(header)
)
)
t_funcall = time.time()
transformation = transformation.reshape(-1, X.shape[1])
X_embedded = np.dot(X, transformation.T) # (n_samples, n_components)
# Compute softmax distances
p_ij = pairwise_distances(X_embedded, squared=True)
np.fill_diagonal(p_ij, np.inf)
p_ij = softmax(-p_ij) # (n_samples, n_samples)
# Compute loss
masked_p_ij = p_ij * same_class_mask
p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1)
loss = np.sum(p)
# Compute gradient of loss w.r.t. `transform`
weighted_p_ij = masked_p_ij - p_ij * p
weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)
# time complexity of the gradient: O(n_components x n_samples x (
# n_samples + n_features))
if self.verbose:
t_funcall = time.time() - t_funcall
values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}"
print(
values_fmt.format(
self.__class__.__name__, self.n_iter_, loss, t_funcall
)
)
sys.stdout.flush()
return sign * loss, sign * gradient.ravel()
def _more_tags(self):
return {"requires_y": True}

View File

@@ -0,0 +1,222 @@
"""
Nearest Centroid Classification
"""
# Author: Robert Layton <robertlayton@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause
import warnings
import numpy as np
from scipy import sparse as sp
from ..base import BaseEstimator, ClassifierMixin
from ..metrics.pairwise import pairwise_distances
from ..preprocessing import LabelEncoder
from ..utils.validation import check_is_fitted
from ..utils.sparsefuncs import csc_median_axis_0
from ..utils.multiclass import check_classification_targets
class NearestCentroid(ClassifierMixin, BaseEstimator):
"""Nearest centroid classifier.
Each class is represented by its centroid, with test samples classified to
the class with the nearest centroid.
Read more in the :ref:`User Guide <nearest_centroid_classifier>`.
Parameters
----------
metric : str or callable, default="euclidean"
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by
:func:`~sklearn.metrics.pairwise_distances` for its metric
parameter. The centroids for the samples corresponding to each class is
the point from which the sum of the distances (according to the metric)
of all samples that belong to that particular class are minimized.
If the `"manhattan"` metric is provided, this centroid is the median
and for all other metrics, the centroid is now set to be the mean.
.. versionchanged:: 0.19
`metric='precomputed'` was deprecated and now raises an error
shrink_threshold : float, default=None
Threshold for shrinking centroids to remove features.
Attributes
----------
centroids_ : array-like of shape (n_classes, n_features)
Centroid of each class.
classes_ : array of shape (n_classes,)
The unique classes labels.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
KNeighborsClassifier : Nearest neighbors classifier.
Notes
-----
When used for text classification with tf-idf vectors, this classifier is
also known as the Rocchio classifier.
References
----------
Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
multiple cancer types by shrunken centroids of gene expression. Proceedings
of the National Academy of Sciences of the United States of America,
99(10), 6567-6572. The National Academy of Sciences.
Examples
--------
>>> from sklearn.neighbors import NearestCentroid
>>> import numpy as np
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> y = np.array([1, 1, 1, 2, 2, 2])
>>> clf = NearestCentroid()
>>> clf.fit(X, y)
NearestCentroid()
>>> print(clf.predict([[-0.8, -1]]))
[1]
"""
def __init__(self, metric="euclidean", *, shrink_threshold=None):
self.metric = metric
self.shrink_threshold = shrink_threshold
def fit(self, X, y):
"""
Fit the NearestCentroid model according to the given training data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples and
`n_features` is the number of features.
Note that centroid shrinking cannot be used with sparse matrices.
y : array-like of shape (n_samples,)
Target values.
Returns
-------
self : object
Fitted estimator.
"""
if self.metric == "precomputed":
raise ValueError("Precomputed is not supported.")
# If X is sparse and the metric is "manhattan", store it in a csc
# format is easier to calculate the median.
if self.metric == "manhattan":
X, y = self._validate_data(X, y, accept_sparse=["csc"])
else:
X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"])
is_X_sparse = sp.issparse(X)
if is_X_sparse and self.shrink_threshold:
raise ValueError("threshold shrinking not supported for sparse input")
check_classification_targets(y)
n_samples, n_features = X.shape
le = LabelEncoder()
y_ind = le.fit_transform(y)
self.classes_ = classes = le.classes_
n_classes = classes.size
if n_classes < 2:
raise ValueError(
"The number of classes has to be greater than one; got %d class"
% (n_classes)
)
# Mask mapping each class to its members.
self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
# Number of clusters in each class.
nk = np.zeros(n_classes)
for cur_class in range(n_classes):
center_mask = y_ind == cur_class
nk[cur_class] = np.sum(center_mask)
if is_X_sparse:
center_mask = np.where(center_mask)[0]
# XXX: Update other averaging methods according to the metrics.
if self.metric == "manhattan":
# NumPy does not calculate median of sparse matrices.
if not is_X_sparse:
self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
else:
self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
else:
if self.metric != "euclidean":
warnings.warn(
"Averaging for metrics other than "
"euclidean and manhattan not supported. "
"The average is set to be the mean."
)
self.centroids_[cur_class] = X[center_mask].mean(axis=0)
if self.shrink_threshold:
if np.all(np.ptp(X, axis=0) == 0):
raise ValueError("All features have zero variance. Division by zero.")
dataset_centroid_ = np.mean(X, axis=0)
# m parameter for determining deviation
m = np.sqrt((1.0 / nk) - (1.0 / n_samples))
# Calculate deviation using the standard deviation of centroids.
variance = (X - self.centroids_[y_ind]) ** 2
variance = variance.sum(axis=0)
s = np.sqrt(variance / (n_samples - n_classes))
s += np.median(s) # To deter outliers from affecting the results.
mm = m.reshape(len(m), 1) # Reshape to allow broadcasting.
ms = mm * s
deviation = (self.centroids_ - dataset_centroid_) / ms
# Soft thresholding: if the deviation crosses 0 during shrinking,
# it becomes zero.
signs = np.sign(deviation)
deviation = np.abs(deviation) - self.shrink_threshold
np.clip(deviation, 0, None, out=deviation)
deviation *= signs
# Now adjust the centroids using the deviation
msd = ms * deviation
self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
return self
def predict(self, X):
"""Perform classification on an array of test vectors `X`.
The predicted class `C` for each sample in `X` is returned.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Test samples.
Returns
-------
C : ndarray of shape (n_samples,)
The predicted classes.
Notes
-----
If the metric constructor parameter is `"precomputed"`, `X` is assumed
to be the distance matrix between the data to be predicted and
`self.centroids_`.
"""
check_is_fitted(self)
X = self._validate_data(X, accept_sparse="csr", reset=False)
return self.classes_[
pairwise_distances(X, self.centroids_, metric=self.metric).argmin(axis=1)
]

View File

@@ -0,0 +1,9 @@
from ..utils._typedefs cimport DTYPE_t, ITYPE_t
cdef int partition_node_indices(
DTYPE_t *data,
ITYPE_t *node_indices,
ITYPE_t split_dim,
ITYPE_t split_index,
ITYPE_t n_features,
ITYPE_t n_points) except -1

View File

@@ -0,0 +1,97 @@
# Author: Thomas Moreau <thomas.moreau.2010@gmail.com>
# Author: Olivier Grisel <olivier.grisel@ensta.fr>
# See quad_tree.pyx for details.
import numpy as np
cimport numpy as np
ctypedef np.npy_float32 DTYPE_t # Type of X
ctypedef np.npy_intp SIZE_t # Type for indices and counters
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
DEBUGFLAG = 0
cdef float EPSILON = 1e-6
# XXX: Careful to not change the order of the arguments. It is important to
# have is_leaf and max_width consecutive as it permits to avoid padding by
# the compiler and keep the size coherent for both C and numpy data structures.
cdef struct Cell:
# Base storage structure for cells in a QuadTree object
# Tree structure
SIZE_t parent # Parent cell of this cell
SIZE_t[8] children # Array pointing to children of this cell
# Cell description
SIZE_t cell_id # Id of the cell in the cells array in the Tree
SIZE_t point_index # Index of the point at this cell (only defined
# in non empty leaf)
bint is_leaf # Does this cell have children?
DTYPE_t squared_max_width # Squared value of the maximum width w
SIZE_t depth # Depth of the cell in the tree
SIZE_t cumulative_size # Number of points included in the subtree with
# this cell as a root.
# Internal constants
DTYPE_t[3] center # Store the center for quick split of cells
DTYPE_t[3] barycenter # Keep track of the center of mass of the cell
# Cell boundaries
DTYPE_t[3] min_bounds # Inferior boundaries of this cell (inclusive)
DTYPE_t[3] max_bounds # Superior boundaries of this cell (exclusive)
cdef class _QuadTree:
# The QuadTree object is a quad tree structure constructed by inserting
# recursively points in the tree and splitting cells in 4 so that each
# leaf cell contains at most one point.
# This structure also handle 3D data, inserted in trees with 8 children
# for each node.
# Parameters of the tree
cdef public int n_dimensions # Number of dimensions in X
cdef public int verbose # Verbosity of the output
cdef SIZE_t n_cells_per_cell # Number of children per node. (2 ** n_dimension)
# Tree inner structure
cdef public SIZE_t max_depth # Max depth of the tree
cdef public SIZE_t cell_count # Counter for node IDs
cdef public SIZE_t capacity # Capacity of tree, in terms of nodes
cdef public SIZE_t n_points # Total number of points
cdef Cell* cells # Array of nodes
# Point insertion methods
cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index,
SIZE_t cell_id=*) nogil except -1
cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
SIZE_t point_index, SIZE_t size=*
) nogil
cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil
cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil
# Create a summary of the Tree compare to a query point
cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
float squared_theta=*, SIZE_t cell_id=*, long idx=*
) nogil
# Internal cell initialization methods
cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds
) nogil
# Private methods
cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell
) nogil except -1
# Private array manipulation to manage the ``cells`` array
cdef int _resize(self, SIZE_t capacity) nogil except -1
cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1
cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) nogil except -1
cdef np.ndarray _get_cell_ndarray(self)

View File

@@ -0,0 +1,471 @@
"""Nearest Neighbor Regression."""
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Sparseness support by Lars Buitinck
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
# Empty radius support by Andreas Bjerre-Nielsen
#
# License: BSD 3 clause (C) INRIA, University of Amsterdam,
# University of Copenhagen
import warnings
import numpy as np
from ._base import _get_weights, _check_weights
from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin
from ..base import RegressorMixin
class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):
"""Regression based on k-nearest neighbors.
The target is predicted by local interpolation of the targets
associated of the nearest neighbors in the training set.
Read more in the :ref:`User Guide <regression>`.
.. versionadded:: 0.9
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to use by default for :meth:`kneighbors` queries.
weights : {'uniform', 'distance'} or callable, default='uniform'
Weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood
are weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
Uniform weights are used by default.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric` and the metrics listed in
`sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the
"cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Doesn't affect :meth:`fit` method.
Attributes
----------
effective_metric_ : str or callable
The distance metric to use. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
See Also
--------
NearestNeighbors : Unsupervised learner for implementing neighbor searches.
RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.
KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.
RadiusNeighborsClassifier : Classifier implementing
a vote among neighbors within a given radius.
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
.. warning::
Regarding the Nearest Neighbors algorithms, if it is found that two
neighbors, neighbor `k+1` and `k`, have identical distances but
different labels, the results will depend on the ordering of the
training data.
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
Examples
--------
>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import KNeighborsRegressor
>>> neigh = KNeighborsRegressor(n_neighbors=2)
>>> neigh.fit(X, y)
KNeighborsRegressor(...)
>>> print(neigh.predict([[1.5]]))
[0.5]
"""
def __init__(
self,
n_neighbors=5,
*,
weights="uniform",
algorithm="auto",
leaf_size=30,
p=2,
metric="minkowski",
metric_params=None,
n_jobs=None,
):
super().__init__(
n_neighbors=n_neighbors,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.weights = weights
def _more_tags(self):
# For cross-validation routines to split data correctly
return {"pairwise": self.metric == "precomputed"}
def fit(self, X, y):
"""Fit the k-nearest neighbors regressor from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : {array-like, sparse matrix} of shape (n_samples,) or \
(n_samples, n_outputs)
Target values.
Returns
-------
self : KNeighborsRegressor
The fitted k-nearest neighbors regressor.
"""
self.weights = _check_weights(self.weights)
return self._fit(X, y)
def predict(self, X):
"""Predict the target for the provided data.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int
Target values.
"""
if self.weights == "uniform":
# In that case, we do not need the distances to perform
# the weighting so we do not compute them.
neigh_ind = self.kneighbors(X, return_distance=False)
neigh_dist = None
else:
neigh_dist, neigh_ind = self.kneighbors(X)
weights = _get_weights(neigh_dist, self.weights)
_y = self._y
if _y.ndim == 1:
_y = _y.reshape((-1, 1))
if weights is None:
y_pred = np.mean(_y[neigh_ind], axis=1)
else:
y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64)
denom = np.sum(weights, axis=1)
for j in range(_y.shape[1]):
num = np.sum(_y[neigh_ind, j] * weights, axis=1)
y_pred[:, j] = num / denom
if self._y.ndim == 1:
y_pred = y_pred.ravel()
return y_pred
class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase):
"""Regression based on neighbors within a fixed radius.
The target is predicted by local interpolation of the targets
associated of the nearest neighbors in the training set.
Read more in the :ref:`User Guide <regression>`.
.. versionadded:: 0.9
Parameters
----------
radius : float, default=1.0
Range of parameter space to use by default for :meth:`radius_neighbors`
queries.
weights : {'uniform', 'distance'} or callable, default='uniform'
Weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood
are weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
Uniform weights are used by default.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : int, default=2
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. See the documentation of :class:`DistanceMetric` for a
list of available metrics.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
effective_metric_ : str or callable
The distance metric to use. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
See Also
--------
NearestNeighbors : Regression based on nearest neighbors.
KNeighborsRegressor : Regression based on k-nearest neighbors.
KNeighborsClassifier : Classifier based on the k-nearest neighbors.
RadiusNeighborsClassifier : Classifier based on neighbors within a given radius.
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
Examples
--------
>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import RadiusNeighborsRegressor
>>> neigh = RadiusNeighborsRegressor(radius=1.0)
>>> neigh.fit(X, y)
RadiusNeighborsRegressor(...)
>>> print(neigh.predict([[1.5]]))
[0.5]
"""
def __init__(
self,
radius=1.0,
*,
weights="uniform",
algorithm="auto",
leaf_size=30,
p=2,
metric="minkowski",
metric_params=None,
n_jobs=None,
):
super().__init__(
radius=radius,
algorithm=algorithm,
leaf_size=leaf_size,
p=p,
metric=metric,
metric_params=metric_params,
n_jobs=n_jobs,
)
self.weights = weights
def fit(self, X, y):
"""Fit the radius neighbors regressor from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : {array-like, sparse matrix} of shape (n_samples,) or \
(n_samples, n_outputs)
Target values.
Returns
-------
self : RadiusNeighborsRegressor
The fitted radius neighbors regressor.
"""
self.weights = _check_weights(self.weights)
return self._fit(X, y)
def predict(self, X):
"""Predict the target for the provided data.
Parameters
----------
X : array-like of shape (n_queries, n_features), \
or (n_queries, n_indexed) if metric == 'precomputed'
Test samples.
Returns
-------
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \
dtype=double
Target values.
"""
neigh_dist, neigh_ind = self.radius_neighbors(X)
weights = _get_weights(neigh_dist, self.weights)
_y = self._y
if _y.ndim == 1:
_y = _y.reshape((-1, 1))
empty_obs = np.full_like(_y[0], np.nan)
if weights is None:
y_pred = np.array(
[
np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs
for (i, ind) in enumerate(neigh_ind)
]
)
else:
y_pred = np.array(
[
np.average(_y[ind, :], axis=0, weights=weights[i])
if len(ind)
else empty_obs
for (i, ind) in enumerate(neigh_ind)
]
)
if np.any(np.isnan(y_pred)):
empty_warning_msg = (
"One or more samples have no neighbors "
"within specified radius; predicting NaN."
)
warnings.warn(empty_warning_msg)
if self._y.ndim == 1:
y_pred = y_pred.ravel()
return y_pred

View File

@@ -0,0 +1,168 @@
"""Unsupervised nearest neighbors learner"""
from ._base import NeighborsBase
from ._base import KNeighborsMixin
from ._base import RadiusNeighborsMixin
class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):
"""Unsupervised learner for implementing neighbor searches.
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
.. versionadded:: 0.9
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to use by default for :meth:`kneighbors` queries.
radius : float, default=1.0
Range of parameter space to use by default for :meth:`radius_neighbors`
queries.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, default=30
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
metric : str or callable, default='minkowski'
The distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. For a list of available metrics, see the documentation of
:class:`~sklearn.metrics.DistanceMetric` and the metrics listed in
`sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the
"cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`sparse graph`,
in which case only "nonzero" elements may be considered neighbors.
p : int, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
effective_metric_ : str
Metric used to compute distances to neighbors.
effective_metric_params_ : dict
Parameters for the metric used to compute distances to neighbors.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_samples_fit_ : int
Number of samples in the fitted data.
See Also
--------
KNeighborsClassifier : Classifier implementing the k-nearest neighbors
vote.
RadiusNeighborsClassifier : Classifier implementing a vote among neighbors
within a given radius.
KNeighborsRegressor : Regression based on k-nearest neighbors.
RadiusNeighborsRegressor : Regression based on neighbors within a fixed
radius.
BallTree : Space partitioning data structure for organizing points in a
multi-dimensional space, used for nearest neighbor search.
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
Examples
--------
>>> import numpy as np
>>> from sklearn.neighbors import NearestNeighbors
>>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
>>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
>>> neigh.fit(samples)
NearestNeighbors(...)
>>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
array([[2, 0]]...)
>>> nbrs = neigh.radius_neighbors(
... [[0, 0, 1.3]], 0.4, return_distance=False
... )
>>> np.asarray(nbrs[0][0])
array(2)
"""
def __init__(
self,
*,
n_neighbors=5,
radius=1.0,
algorithm="auto",
leaf_size=30,
metric="minkowski",
p=2,
metric_params=None,
n_jobs=None,
):
super().__init__(
n_neighbors=n_neighbors,
radius=radius,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
p=p,
metric_params=metric_params,
n_jobs=n_jobs,
)
def fit(self, X, y=None):
"""Fit the nearest neighbors estimator from the training dataset.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples) if metric='precomputed'
Training data.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : NearestNeighbors
The fitted nearest neighbors estimator.
"""
return self._fit(X)

View File

@@ -0,0 +1,44 @@
import os
def configuration(parent_package="", top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration("neighbors", parent_package, top_path)
libraries = []
if os.name == "posix":
libraries.append("m")
config.add_extension(
"_ball_tree",
sources=["_ball_tree.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
)
config.add_extension(
"_kd_tree",
sources=["_kd_tree.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
)
config.add_extension(
"_partition_nodes",
sources=["_partition_nodes.pyx"],
include_dirs=[numpy.get_include()],
language="c++",
libraries=libraries,
)
config.add_extension(
"_quad_tree",
sources=["_quad_tree.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
)
config.add_subpackage("tests")
return config

View File

@@ -0,0 +1,104 @@
import itertools
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal
from sklearn.neighbors._ball_tree import BallTree
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_array
from sklearn.utils._testing import _convert_container
rng = np.random.RandomState(10)
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)
DIMENSION = 3
METRICS = {
"euclidean": {},
"manhattan": {},
"minkowski": dict(p=3),
"chebyshev": {},
"seuclidean": dict(V=rng.random_sample(DIMENSION)),
"wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)),
"mahalanobis": dict(V=V_mahalanobis),
}
DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"]
BOOLEAN_METRICS = [
"matching",
"jaccard",
"dice",
"kulsinski",
"rogerstanimoto",
"russellrao",
"sokalmichener",
"sokalsneath",
]
def brute_force_neighbors(X, Y, k, metric, **kwargs):
from sklearn.metrics import DistanceMetric
X, Y = check_array(X), check_array(Y)
D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
ind = np.argsort(D, axis=1)[:, :k]
dist = D[np.arange(Y.shape[0])[:, None], ind]
return dist, ind
@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS))
@pytest.mark.parametrize("array_type", ["list", "array"])
def test_ball_tree_query_metrics(metric, array_type):
rng = check_random_state(0)
if metric in BOOLEAN_METRICS:
X = rng.random_sample((40, 10)).round(0)
Y = rng.random_sample((10, 10)).round(0)
elif metric in DISCRETE_METRICS:
X = (4 * rng.random_sample((40, 10))).round(0)
Y = (4 * rng.random_sample((10, 10))).round(0)
X = _convert_container(X, array_type)
Y = _convert_container(Y, array_type)
k = 5
bt = BallTree(X, leaf_size=1, metric=metric)
dist1, ind1 = bt.query(Y, k)
dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
assert_array_almost_equal(dist1, dist2)
def test_query_haversine():
rng = check_random_state(0)
X = 2 * np.pi * rng.random_sample((40, 2))
bt = BallTree(X, leaf_size=1, metric="haversine")
dist1, ind1 = bt.query(X, k=5)
dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine")
assert_array_almost_equal(dist1, dist2)
assert_array_almost_equal(ind1, ind2)
def test_array_object_type():
"""Check that we do not accept object dtype array."""
X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
with pytest.raises(ValueError, match="setting an array element with a sequence"):
BallTree(X)
def test_bad_pyfunc_metric():
def wrong_returned_value(x, y):
return "1"
def one_arg_func(x):
return 1.0 # pragma: no cover
X = np.ones((5, 2))
msg = "Custom distance function must accept two vectors and return a float."
with pytest.raises(TypeError, match=msg):
BallTree(X, metric=wrong_returned_value)
msg = "takes 1 positional argument but 2 were given"
with pytest.raises(TypeError, match=msg):
BallTree(X, metric=one_arg_func)

View File

@@ -0,0 +1,101 @@
import numpy as np
import pytest
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer
from sklearn.neighbors._base import _is_sorted_by_data
from sklearn.utils._testing import assert_array_equal
def test_transformer_result():
# Test the number of neighbors returned
n_neighbors = 5
n_samples_fit = 20
n_queries = 18
n_features = 10
rng = np.random.RandomState(42)
X = rng.randn(n_samples_fit, n_features)
X2 = rng.randn(n_queries, n_features)
radius = np.percentile(euclidean_distances(X), 10)
# with n_neighbors
for mode in ["distance", "connectivity"]:
add_one = mode == "distance"
nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)
Xt = nnt.fit_transform(X)
assert Xt.shape == (n_samples_fit, n_samples_fit)
assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),)
assert Xt.format == "csr"
assert _is_sorted_by_data(Xt)
X2t = nnt.transform(X2)
assert X2t.shape == (n_queries, n_samples_fit)
assert X2t.data.shape == (n_queries * (n_neighbors + add_one),)
assert X2t.format == "csr"
assert _is_sorted_by_data(X2t)
# with radius
for mode in ["distance", "connectivity"]:
add_one = mode == "distance"
nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)
Xt = nnt.fit_transform(X)
assert Xt.shape == (n_samples_fit, n_samples_fit)
assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),)
assert Xt.format == "csr"
assert _is_sorted_by_data(Xt)
X2t = nnt.transform(X2)
assert X2t.shape == (n_queries, n_samples_fit)
assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),)
assert X2t.format == "csr"
assert _is_sorted_by_data(X2t)
def _has_explicit_diagonal(X):
"""Return True if the diagonal is explicitly stored"""
X = X.tocoo()
explicit = X.row[X.row == X.col]
return len(explicit) == X.shape[0]
def test_explicit_diagonal():
# Test that the diagonal is explicitly stored in the sparse graph
n_neighbors = 5
n_samples_fit, n_samples_transform, n_features = 20, 18, 10
rng = np.random.RandomState(42)
X = rng.randn(n_samples_fit, n_features)
X2 = rng.randn(n_samples_transform, n_features)
nnt = KNeighborsTransformer(n_neighbors=n_neighbors)
Xt = nnt.fit_transform(X)
assert _has_explicit_diagonal(Xt)
assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)
Xt = nnt.transform(X)
assert _has_explicit_diagonal(Xt)
assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)
# Using transform on new data should not always have zero diagonal
X2t = nnt.transform(X2)
assert not _has_explicit_diagonal(X2t)
@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer])
def test_graph_feature_names_out(Klass):
"""Check `get_feature_names_out` for transformers defined in `_graph.py`."""
n_samples_fit = 20
n_features = 10
rng = np.random.RandomState(42)
X = rng.randn(n_samples_fit, n_features)
est = Klass().fit(X)
names_out = est.get_feature_names_out()
class_name_lower = Klass.__name__.lower()
expected_names_out = np.array(
[f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)],
dtype=object,
)
assert_array_equal(names_out, expected_names_out)

View File

@@ -0,0 +1,31 @@
import numpy as np
import pytest
from joblib import Parallel
from sklearn.utils.fixes import delayed
from sklearn.neighbors._kd_tree import KDTree
DIMENSION = 3
METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)}
def test_array_object_type():
"""Check that we do not accept object dtype array."""
X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
with pytest.raises(ValueError, match="setting an array element with a sequence"):
KDTree(X)
def test_kdtree_picklable_with_joblib():
"""Make sure that KDTree queries work when joblib memmaps.
Non-regression test for #21685 and #21228."""
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
tree = KDTree(X, leaf_size=2)
# Call Parallel with max_nbytes=1 to trigger readonly memory mapping that
# use to raise "ValueError: buffer source array is read-only" in a previous
# version of the Cython code.
Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X])

View File

@@ -0,0 +1,249 @@
import numpy as np
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors
from sklearn.neighbors._ball_tree import kernel_norm
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
import joblib
# XXX Duplicated in test_neighbors_tree, test_kde
def compute_kernel_slow(Y, X, kernel, h):
d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]
if kernel == "gaussian":
return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
elif kernel == "tophat":
return norm * (d < h).sum(-1)
elif kernel == "epanechnikov":
return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
elif kernel == "exponential":
return norm * (np.exp(-d / h)).sum(-1)
elif kernel == "linear":
return norm * ((1 - d / h) * (d < h)).sum(-1)
elif kernel == "cosine":
return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
else:
raise ValueError("kernel not recognized")
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol)
log_dens = kde.fit(X).score_samples(Y)
assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol))
assert_allclose(
np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol)
)
@pytest.mark.parametrize(
"kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
)
@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1])
def test_kernel_density(kernel, bandwidth):
n_samples, n_features = (100, 3)
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
Y = rng.randn(n_samples, n_features)
dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)
for rtol in [0, 1e-5]:
for atol in [1e-6, 1e-2]:
for breadth_first in (True, False):
check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true)
def test_kernel_density_sampling(n_samples=100, n_features=3):
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
bandwidth = 0.2
for kernel in ["gaussian", "tophat"]:
# draw a tophat sample
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
samp = kde.sample(100)
assert X.shape == samp.shape
# check that samples are in the right range
nbrs = NearestNeighbors(n_neighbors=1).fit(X)
dist, ind = nbrs.kneighbors(X, return_distance=True)
if kernel == "tophat":
assert np.all(dist < bandwidth)
elif kernel == "gaussian":
# 5 standard deviations is safe for 100 samples, but there's a
# very small chance this test could fail.
assert np.all(dist < 5 * bandwidth)
# check unsupported kernels
for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
with pytest.raises(NotImplementedError):
kde.sample(100)
# non-regression test: used to return a scalar
X = rng.randn(4, 1)
kde = KernelDensity(kernel="gaussian").fit(X)
assert kde.sample().shape == (1, 1)
@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"])
@pytest.mark.parametrize(
"metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"]
)
def test_kde_algorithm_metric_choice(algorithm, metric):
# Smoke test for various metrics and algorithms
rng = np.random.RandomState(0)
X = rng.randn(10, 2) # 2 features required for haversine dist.
Y = rng.randn(10, 2)
kde = KernelDensity(algorithm=algorithm, metric=metric)
if algorithm == "kd_tree" and metric not in KDTree.valid_metrics:
with pytest.raises(ValueError):
kde.fit(X)
else:
kde.fit(X)
y_dens = kde.score_samples(Y)
assert y_dens.shape == Y.shape[:1]
def test_kde_score(n_samples=100, n_features=3):
pass
# FIXME
# rng = np.random.RandomState(0)
# X = rng.random_sample((n_samples, n_features))
# Y = rng.random_sample((n_samples, n_features))
def test_kde_badargs():
X = np.random.random((200, 10))
with pytest.raises(ValueError):
KernelDensity(algorithm="blah").fit(X)
with pytest.raises(ValueError):
KernelDensity(bandwidth=0).fit(X)
with pytest.raises(ValueError):
KernelDensity(kernel="blah").fit(X)
with pytest.raises(ValueError):
KernelDensity(metric="blah").fit(X)
with pytest.raises(ValueError):
KernelDensity(algorithm="kd_tree", metric="blah").fit(X)
kde = KernelDensity()
with pytest.raises(ValueError):
kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10)))
with pytest.raises(ValueError):
kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200))
def test_kde_pipeline_gridsearch():
# test that kde plays nice in pipelines and grid-searches
X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]])
pipe1 = make_pipeline(
StandardScaler(with_mean=False, with_std=False),
KernelDensity(kernel="gaussian"),
)
params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
search = GridSearchCV(pipe1, param_grid=params)
search.fit(X)
assert search.best_params_["kerneldensity__bandwidth"] == 0.1
def test_kde_sample_weights():
n_samples = 400
size_test = 20
weights_neutral = np.full(n_samples, 3.0)
for d in [1, 2, 10]:
rng = np.random.RandomState(0)
X = rng.rand(n_samples, d)
weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
X_repetitions = np.repeat(X, weights, axis=0)
n_samples_test = size_test // d
test_points = rng.rand(n_samples_test, d)
for algorithm in ["auto", "ball_tree", "kd_tree"]:
for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]:
if algorithm != "kd_tree" or metric in KDTree.valid_metrics:
kde = KernelDensity(algorithm=algorithm, metric=metric)
# Test that adding a constant sample weight has no effect
kde.fit(X, sample_weight=weights_neutral)
scores_const_weight = kde.score_samples(test_points)
sample_const_weight = kde.sample(random_state=1234)
kde.fit(X)
scores_no_weight = kde.score_samples(test_points)
sample_no_weight = kde.sample(random_state=1234)
assert_allclose(scores_const_weight, scores_no_weight)
assert_allclose(sample_const_weight, sample_no_weight)
# Test equivalence between sampling and (integer) weights
kde.fit(X, sample_weight=weights)
scores_weight = kde.score_samples(test_points)
sample_weight = kde.sample(random_state=1234)
kde.fit(X_repetitions)
scores_ref_sampling = kde.score_samples(test_points)
sample_ref_sampling = kde.sample(random_state=1234)
assert_allclose(scores_weight, scores_ref_sampling)
assert_allclose(sample_weight, sample_ref_sampling)
# Test that sample weights has a non-trivial effect
diff = np.max(np.abs(scores_no_weight - scores_weight))
assert diff > 0.001
# Test invariance with respect to arbitrary scaling
scale_factor = rng.rand()
kde.fit(X, sample_weight=(scale_factor * weights))
scores_scaled_weight = kde.score_samples(test_points)
assert_allclose(scores_scaled_weight, scores_weight)
def test_sample_weight_invalid():
# Check sample weighting raises errors.
kde = KernelDensity()
data = np.reshape([1.0, 2.0, 3.0], (-1, 1))
sample_weight = [0.1, -0.2, 0.3]
expected_err = "Negative values in data passed to `sample_weight`"
with pytest.raises(ValueError, match=expected_err):
kde.fit(data, sample_weight=sample_weight)
@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]])
def test_pickling(tmpdir, sample_weight):
# Make sure that predictions are the same before and after pickling. Used
# to be a bug because sample_weights wasn't pickled and the resulting tree
# would miss some info.
kde = KernelDensity()
data = np.reshape([1.0, 2.0, 3.0], (-1, 1))
kde.fit(data, sample_weight=sample_weight)
X = np.reshape([1.1, 2.1], (-1, 1))
scores = kde.score_samples(X)
file_path = str(tmpdir.join("dump.pkl"))
joblib.dump(kde, file_path)
kde = joblib.load(file_path)
scores_pickled = kde.score_samples(X)
assert_allclose(scores, scores_pickled)
@pytest.mark.parametrize("method", ["score_samples", "sample"])
def test_check_is_fitted(method):
# Check that predict raises an exception in an unfitted estimator.
# Unfitted estimators should raise a NotFittedError.
rng = np.random.RandomState(0)
X = rng.randn(10, 2)
kde = KernelDensity()
with pytest.raises(NotFittedError):
getattr(kde, method)(X)

View File

@@ -0,0 +1,235 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
from math import sqrt
import numpy as np
from sklearn import neighbors
import re
import pytest
from numpy.testing import assert_array_equal
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.datasets import load_iris
# load the iris dataset
# and randomly permute it
rng = check_random_state(0)
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
def test_lof():
# Toy sample (the last two samples are outliers):
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]
# Test LocalOutlierFactor:
clf = neighbors.LocalOutlierFactor(n_neighbors=5)
score = clf.fit(X).negative_outlier_factor_
assert_array_equal(clf._fit_X, X)
# Assert largest outlier score is smaller than smallest inlier score:
assert np.min(score[:-2]) > np.max(score[-2:])
# Assert predict() works:
clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X)
assert_array_equal(clf._predict(), 6 * [1] + 2 * [-1])
assert_array_equal(clf.fit_predict(X), 6 * [1] + 2 * [-1])
def test_lof_performance():
# Generate train/test data
rng = check_random_state(2)
X = 0.3 * rng.randn(120, 2)
X_train = X[:100]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X_test = np.r_[X[100:], X_outliers]
y_test = np.array([0] * 20 + [1] * 20)
# fit the model for novelty detection
clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)
# predict scores (the lower, the more normal)
y_pred = -clf.decision_function(X_test)
# check that roc_auc is good
assert roc_auc_score(y_test, y_pred) > 0.99
def test_lof_values():
# toy samples:
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = neighbors.LocalOutlierFactor(
n_neighbors=2, contamination=0.1, novelty=True
).fit(X_train)
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train)
s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0))
s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2)))
# check predict()
assert_array_almost_equal(-clf1.negative_outlier_factor_, [s_0, s_1, s_1])
assert_array_almost_equal(-clf2.negative_outlier_factor_, [s_0, s_1, s_1])
# check predict(one sample not in train)
assert_array_almost_equal(-clf1.score_samples([[2.0, 2.0]]), [s_0])
assert_array_almost_equal(-clf2.score_samples([[2.0, 2.0]]), [s_0])
# check predict(one sample already in train)
assert_array_almost_equal(-clf1.score_samples([[1.0, 1.0]]), [s_1])
assert_array_almost_equal(-clf2.score_samples([[1.0, 1.0]]), [s_1])
def test_lof_precomputed(random_state=42):
"""Tests LOF with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric="euclidean")
DYX = metrics.pairwise_distances(Y, X, metric="euclidean")
# As a feature matrix (n_samples by n_features)
lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
lof_X.fit(X)
pred_X_X = lof_X._predict()
pred_X_Y = lof_X.predict(Y)
# As a dense distance matrix (n_samples by n_samples)
lof_D = neighbors.LocalOutlierFactor(
n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True
)
lof_D.fit(DXX)
pred_D_X = lof_D._predict()
pred_D_Y = lof_D.predict(DYX)
assert_array_almost_equal(pred_X_X, pred_D_X)
assert_array_almost_equal(pred_X_Y, pred_D_Y)
def test_n_neighbors_attribute():
X = iris.data
clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
assert clf.n_neighbors_ == X.shape[0] - 1
clf = neighbors.LocalOutlierFactor(n_neighbors=500)
msg = "n_neighbors will be set to (n_samples - 1)"
with pytest.warns(UserWarning, match=re.escape(msg)):
clf.fit(X)
assert clf.n_neighbors_ == X.shape[0] - 1
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = neighbors.LocalOutlierFactor(
n_neighbors=2, contamination=0.1, novelty=True
).fit(X_train)
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train)
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]),
clf1.decision_function([[2.0, 2.0]]) + clf1.offset_,
)
assert_array_equal(
clf2.score_samples([[2.0, 2.0]]),
clf2.decision_function([[2.0, 2.0]]) + clf2.offset_,
)
assert_array_equal(
clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]])
)
def test_contamination():
X = [[1, 1], [1, 0]]
clf = neighbors.LocalOutlierFactor(contamination=0.6)
with pytest.raises(ValueError):
clf.fit(X)
def test_novelty_errors():
X = iris.data
# check errors for novelty=False
clf = neighbors.LocalOutlierFactor()
clf.fit(X)
# predict, decision_function and score_samples raise ValueError
for method in ["predict", "decision_function", "score_samples"]:
msg = "{} is not available when novelty=False".format(method)
with pytest.raises(AttributeError, match=msg):
getattr(clf, method)
# check errors for novelty=True
clf = neighbors.LocalOutlierFactor(novelty=True)
msg = "fit_predict is not available when novelty=True"
with pytest.raises(AttributeError, match=msg):
getattr(clf, "fit_predict")
def test_novelty_training_scores():
# check that the scores of the training samples are still accessible
# when novelty=True through the negative_outlier_factor_ attribute
X = iris.data
# fit with novelty=False
clf_1 = neighbors.LocalOutlierFactor()
clf_1.fit(X)
scores_1 = clf_1.negative_outlier_factor_
# fit with novelty=True
clf_2 = neighbors.LocalOutlierFactor(novelty=True)
clf_2.fit(X)
scores_2 = clf_2.negative_outlier_factor_
assert_array_almost_equal(scores_1, scores_2)
def test_hasattr_prediction():
# check availability of prediction methods depending on novelty value.
X = [[1, 1], [1, 2], [2, 1]]
# when novelty=True
clf = neighbors.LocalOutlierFactor(novelty=True)
clf.fit(X)
assert hasattr(clf, "predict")
assert hasattr(clf, "decision_function")
assert hasattr(clf, "score_samples")
assert not hasattr(clf, "fit_predict")
# when novelty=False
clf = neighbors.LocalOutlierFactor(novelty=False)
clf.fit(X)
assert hasattr(clf, "fit_predict")
assert not hasattr(clf, "predict")
assert not hasattr(clf, "decision_function")
assert not hasattr(clf, "score_samples")
@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
def test_novelty_true_common_tests(estimator, check):
# the common tests are run for the default LOF (novelty=False).
# here we run these common tests for LOF when novelty=True
check(estimator)
@pytest.mark.parametrize("expected_outliers", [30, 53])
def test_predicted_outlier_number(expected_outliers):
# the number of predicted outliers should be equal to the number of
# expected outliers unless there are ties in the abnormality scores.
X = iris.data
n_samples = X.shape[0]
contamination = float(expected_outliers) / n_samples
clf = neighbors.LocalOutlierFactor(contamination=contamination)
y_pred = clf.fit_predict(X)
num_outliers = np.sum(y_pred != 1)
if num_outliers != expected_outliers:
y_dec = clf.negative_outlier_factor_
check_outlier_corruption(num_outliers, expected_outliers, y_dec)

View File

@@ -0,0 +1,572 @@
"""
Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca)
"""
# Authors: William de Vazelhes <wdevazelhes@gmail.com>
# John Chiotellis <ioannis.chiotellis@in.tum.de>
# License: BSD 3 clause
import pytest
import re
import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from scipy.optimize import check_grad
from sklearn import clone
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import check_random_state
from sklearn.datasets import load_iris, make_classification, make_blobs
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.metrics import pairwise_distances
rng = check_random_state(0)
# load and shuffle iris dataset
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris_data = iris.data[perm]
iris_target = iris.target[perm]
EPS = np.finfo(float).eps
def test_simple_example():
"""Test on a simple example.
Puts four points in the input space where the opposite labels points are
next to each other. After transform the samples from the same class
should be next to each other.
"""
X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
y = np.array([1, 0, 1, 0])
nca = NeighborhoodComponentsAnalysis(
n_components=2, init="identity", random_state=42
)
nca.fit(X, y)
X_t = nca.transform(X)
assert_array_equal(pairwise_distances(X_t).argsort()[:, 1], np.array([2, 3, 0, 1]))
def test_toy_example_collapse_points():
"""Test on a toy example of three points that should collapse
We build a simple example: two points from the same class and a point from
a different class in the middle of them. On this simple example, the new
(transformed) points should all collapse into one single point. Indeed, the
objective is 2/(1 + exp(d/2)), with d the euclidean distance between the
two samples from the same class. This is maximized for d=0 (because d>=0),
with an objective equal to 1 (loss=-1.).
"""
rng = np.random.RandomState(42)
input_dim = 5
two_points = rng.randn(2, input_dim)
X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
y = [0, 0, 1]
class LossStorer:
def __init__(self, X, y):
self.loss = np.inf # initialize the loss to very high
# Initialize a fake NCA and variables needed to compute the loss:
self.fake_nca = NeighborhoodComponentsAnalysis()
self.fake_nca.n_iter_ = np.inf
self.X, y, _ = self.fake_nca._validate_params(X, y)
self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
def callback(self, transformation, n_iter):
"""Stores the last value of the loss function"""
self.loss, _ = self.fake_nca._loss_grad_lbfgs(
transformation, self.X, self.same_class_mask, -1.0
)
loss_storer = LossStorer(X, y)
nca = NeighborhoodComponentsAnalysis(random_state=42, callback=loss_storer.callback)
X_t = nca.fit_transform(X, y)
print(X_t)
# test that points are collapsed into one point
assert_array_almost_equal(X_t - X_t[0], 0.0)
assert abs(loss_storer.loss + 1) < 1e-10
def test_finite_differences():
"""Test gradient of loss function
Assert that the gradient is almost equal to its finite differences
approximation.
"""
# Initialize the transformation `M`, as well as `X` and `y` and `NCA`
rng = np.random.RandomState(42)
X, y = make_classification()
M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
nca = NeighborhoodComponentsAnalysis()
nca.n_iter_ = 0
mask = y[:, np.newaxis] == y[np.newaxis, :]
def fun(M):
return nca._loss_grad_lbfgs(M, X, mask)[0]
def grad(M):
return nca._loss_grad_lbfgs(M, X, mask)[1]
# compute relative error
rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M))
np.testing.assert_almost_equal(rel_diff, 0.0, decimal=5)
def test_params_validation():
# Test that invalid parameters raise value error
X = np.arange(12).reshape(4, 3)
y = [1, 1, 2, 2]
NCA = NeighborhoodComponentsAnalysis
rng = np.random.RandomState(42)
# TypeError
with pytest.raises(TypeError):
NCA(max_iter="21").fit(X, y)
with pytest.raises(TypeError):
NCA(verbose="true").fit(X, y)
with pytest.raises(TypeError):
NCA(tol="1").fit(X, y)
with pytest.raises(TypeError):
NCA(n_components="invalid").fit(X, y)
with pytest.raises(TypeError):
NCA(warm_start=1).fit(X, y)
# ValueError
msg = (
r"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' or a "
r"numpy array of shape (n_components, n_features)."
)
with pytest.raises(ValueError, match=re.escape(msg)):
NCA(init=1).fit(X, y)
with pytest.raises(ValueError, match="max_iter == -1, must be >= 1."):
NCA(max_iter=-1).fit(X, y)
init = rng.rand(5, 3)
msg = (
f"The output dimensionality ({init.shape[0]}) "
"of the given linear transformation `init` cannot be "
f"greater than its input dimensionality ({init.shape[1]})."
)
with pytest.raises(ValueError, match=re.escape(msg)):
NCA(init=init).fit(X, y)
n_components = 10
msg = (
"The preferred dimensionality of the projected space "
f"`n_components` ({n_components}) cannot be greater "
f"than the given data dimensionality ({X.shape[1]})!"
)
with pytest.raises(ValueError, match=re.escape(msg)):
NCA(n_components=n_components).fit(X, y)
def test_transformation_dimensions():
X = np.arange(12).reshape(4, 3)
y = [1, 1, 2, 2]
# Fail if transformation input dimension does not match inputs dimensions
transformation = np.array([[1, 2], [3, 4]])
with pytest.raises(ValueError):
NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)
# Fail if transformation output dimension is larger than
# transformation input dimension
transformation = np.array([[1, 2], [3, 4], [5, 6]])
# len(transformation) > len(transformation[0])
with pytest.raises(ValueError):
NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)
# Pass otherwise
transformation = np.arange(9).reshape(3, 3)
NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)
def test_n_components():
rng = np.random.RandomState(42)
X = np.arange(12).reshape(4, 3)
y = [1, 1, 2, 2]
init = rng.rand(X.shape[1] - 1, 3)
# n_components = X.shape[1] != transformation.shape[0]
n_components = X.shape[1]
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
msg = (
"The preferred dimensionality of the projected space "
f"`n_components` ({n_components}) does not match the output "
"dimensionality of the given linear transformation "
f"`init` ({init.shape[0]})!"
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X, y)
# n_components > X.shape[1]
n_components = X.shape[1] + 2
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
msg = (
"The preferred dimensionality of the projected space "
f"`n_components` ({n_components}) cannot be greater than "
f"the given data dimensionality ({X.shape[1]})!"
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X, y)
# n_components < X.shape[1]
nca = NeighborhoodComponentsAnalysis(n_components=2, init="identity")
nca.fit(X, y)
def test_init_transformation():
rng = np.random.RandomState(42)
X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
# Start learning from scratch
nca = NeighborhoodComponentsAnalysis(init="identity")
nca.fit(X, y)
# Initialize with random
nca_random = NeighborhoodComponentsAnalysis(init="random")
nca_random.fit(X, y)
# Initialize with auto
nca_auto = NeighborhoodComponentsAnalysis(init="auto")
nca_auto.fit(X, y)
# Initialize with PCA
nca_pca = NeighborhoodComponentsAnalysis(init="pca")
nca_pca.fit(X, y)
# Initialize with LDA
nca_lda = NeighborhoodComponentsAnalysis(init="lda")
nca_lda.fit(X, y)
init = rng.rand(X.shape[1], X.shape[1])
nca = NeighborhoodComponentsAnalysis(init=init)
nca.fit(X, y)
# init.shape[1] must match X.shape[1]
init = rng.rand(X.shape[1], X.shape[1] + 1)
nca = NeighborhoodComponentsAnalysis(init=init)
msg = (
f"The input dimensionality ({init.shape[1]}) of the given "
"linear transformation `init` must match the "
f"dimensionality of the given inputs `X` ({X.shape[1]})."
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X, y)
# init.shape[0] must be <= init.shape[1]
init = rng.rand(X.shape[1] + 1, X.shape[1])
nca = NeighborhoodComponentsAnalysis(init=init)
msg = (
f"The output dimensionality ({init.shape[0]}) of the given "
"linear transformation `init` cannot be "
f"greater than its input dimensionality ({init.shape[1]})."
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X, y)
# init.shape[0] must match n_components
init = rng.rand(X.shape[1], X.shape[1])
n_components = X.shape[1] - 2
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
msg = (
"The preferred dimensionality of the "
f"projected space `n_components` ({n_components}) "
"does not match the output dimensionality of the given "
f"linear transformation `init` ({init.shape[0]})!"
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X, y)
@pytest.mark.parametrize("n_samples", [3, 5, 7, 11])
@pytest.mark.parametrize("n_features", [3, 5, 7, 11])
@pytest.mark.parametrize("n_classes", [5, 7, 11])
@pytest.mark.parametrize("n_components", [3, 5, 7, 11])
def test_auto_init(n_samples, n_features, n_classes, n_components):
# Test that auto choose the init as expected with every configuration
# of order of n_samples, n_features, n_classes and n_components.
rng = np.random.RandomState(42)
nca_base = NeighborhoodComponentsAnalysis(
init="auto", n_components=n_components, max_iter=1, random_state=rng
)
if n_classes >= n_samples:
pass
# n_classes > n_samples is impossible, and n_classes == n_samples
# throws an error from lda but is an absurd case
else:
X = rng.randn(n_samples, n_features)
y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
if n_components > n_features:
# this would return a ValueError, which is already tested in
# test_params_validation
pass
else:
nca = clone(nca_base)
nca.fit(X, y)
if n_components <= min(n_classes - 1, n_features):
nca_other = clone(nca_base).set_params(init="lda")
elif n_components < min(n_features, n_samples):
nca_other = clone(nca_base).set_params(init="pca")
else:
nca_other = clone(nca_base).set_params(init="identity")
nca_other.fit(X, y)
assert_array_almost_equal(nca.components_, nca_other.components_)
def test_warm_start_validation():
X, y = make_classification(
n_samples=30,
n_features=5,
n_classes=4,
n_redundant=0,
n_informative=5,
random_state=0,
)
nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
nca.fit(X, y)
X_less_features, y = make_classification(
n_samples=30,
n_features=4,
n_classes=4,
n_redundant=0,
n_informative=4,
random_state=0,
)
msg = (
f"The new inputs dimensionality ({X_less_features.shape[1]}) "
"does not match the input dimensionality of the previously learned "
f"transformation ({nca.components_.shape[1]})."
)
with pytest.raises(ValueError, match=re.escape(msg)):
nca.fit(X_less_features, y)
def test_warm_start_effectiveness():
# A 1-iteration second fit on same data should give almost same result
# with warm starting, and quite different result without warm starting.
nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0)
nca_warm.fit(iris_data, iris_target)
transformation_warm = nca_warm.components_
nca_warm.max_iter = 1
nca_warm.fit(iris_data, iris_target)
transformation_warm_plus_one = nca_warm.components_
nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0)
nca_cold.fit(iris_data, iris_target)
transformation_cold = nca_cold.components_
nca_cold.max_iter = 1
nca_cold.fit(iris_data, iris_target)
transformation_cold_plus_one = nca_cold.components_
diff_warm = np.sum(np.abs(transformation_warm_plus_one - transformation_warm))
diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold))
assert diff_warm < 3.0, (
"Transformer changed significantly after one "
"iteration even though it was warm-started."
)
assert diff_cold > diff_warm, (
"Cold-started transformer changed less "
"significantly than warm-started "
"transformer after one iteration."
)
@pytest.mark.parametrize(
"init_name", ["pca", "lda", "identity", "random", "precomputed"]
)
def test_verbose(init_name, capsys):
# assert there is proper output when verbose = 1, for every initialization
# except auto because auto will call one of the others
rng = np.random.RandomState(42)
X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
regexp_init = r"... done in \ *\d+\.\d{2}s"
msgs = {
"pca": "Finding principal components" + regexp_init,
"lda": "Finding most discriminative components" + regexp_init,
}
if init_name == "precomputed":
init = rng.randn(X.shape[1], X.shape[1])
else:
init = init_name
nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
nca.fit(X, y)
out, _ = capsys.readouterr()
# check output
lines = re.split("\n+", out)
# if pca or lda init, an additional line is printed, so we test
# it and remove it to test the rest equally among initializations
if init_name in ["pca", "lda"]:
assert re.match(msgs[init_name], lines[0])
lines = lines[1:]
assert lines[0] == "[NeighborhoodComponentsAnalysis]"
header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value", "Time(s)")
assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header)
assert lines[2] == "[NeighborhoodComponentsAnalysis] {}".format("-" * len(header))
for line in lines[3:-2]:
# The following regex will match for instance:
# '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01'
assert re.match(
r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e"
r"[+|-]\d+\ *\d+\.\d{2}",
line,
)
assert re.match(
r"\[NeighborhoodComponentsAnalysis\] Training took\ *" r"\d+\.\d{2}s\.",
lines[-2],
)
assert lines[-1] == ""
def test_no_verbose(capsys):
# assert by default there is no output (verbose=0)
nca = NeighborhoodComponentsAnalysis()
nca.fit(iris_data, iris_target)
out, _ = capsys.readouterr()
# check output
assert out == ""
def test_singleton_class():
X = iris_data
y = iris_target
# one singleton class
singleton_class = 1
(ind_singleton,) = np.where(y == singleton_class)
y[ind_singleton] = 2
y[ind_singleton[0]] = singleton_class
nca = NeighborhoodComponentsAnalysis(max_iter=30)
nca.fit(X, y)
# One non-singleton class
(ind_1,) = np.where(y == 1)
(ind_2,) = np.where(y == 2)
y[ind_1] = 0
y[ind_1[0]] = 1
y[ind_2] = 0
y[ind_2[0]] = 2
nca = NeighborhoodComponentsAnalysis(max_iter=30)
nca.fit(X, y)
# Only singleton classes
(ind_0,) = np.where(y == 0)
(ind_1,) = np.where(y == 1)
(ind_2,) = np.where(y == 2)
X = X[[ind_0[0], ind_1[0], ind_2[0]]]
y = y[[ind_0[0], ind_1[0], ind_2[0]]]
nca = NeighborhoodComponentsAnalysis(init="identity", max_iter=30)
nca.fit(X, y)
assert_array_equal(X, nca.transform(X))
def test_one_class():
X = iris_data[iris_target == 0]
y = iris_target[iris_target == 0]
nca = NeighborhoodComponentsAnalysis(
max_iter=30, n_components=X.shape[1], init="identity"
)
nca.fit(X, y)
assert_array_equal(X, nca.transform(X))
def test_callback(capsys):
X = iris_data
y = iris_target
nca = NeighborhoodComponentsAnalysis(callback="my_cb")
with pytest.raises(ValueError):
nca.fit(X, y)
max_iter = 10
def my_cb(transformation, n_iter):
assert transformation.shape == (iris_data.shape[1] ** 2,)
rem_iter = max_iter - n_iter
print("{} iterations remaining...".format(rem_iter))
# assert that my_cb is called
nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1)
nca.fit(iris_data, iris_target)
out, _ = capsys.readouterr()
# check output
assert "{} iterations remaining...".format(max_iter - 1) in out
def test_expected_transformation_shape():
"""Test that the transformation has the expected shape."""
X = iris_data
y = iris_target
class TransformationStorer:
def __init__(self, X, y):
# Initialize a fake NCA and variables needed to call the loss
# function:
self.fake_nca = NeighborhoodComponentsAnalysis()
self.fake_nca.n_iter_ = np.inf
self.X, y, _ = self.fake_nca._validate_params(X, y)
self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
def callback(self, transformation, n_iter):
"""Stores the last value of the transformation taken as input by
the optimizer"""
self.transformation = transformation
transformation_storer = TransformationStorer(X, y)
cb = transformation_storer.callback
nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb)
nca.fit(X, y)
assert transformation_storer.transformation.size == X.shape[1] ** 2
def test_convergence_warning():
nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
cls_name = nca.__class__.__name__
msg = "[{}] NCA did not converge".format(cls_name)
with pytest.warns(ConvergenceWarning, match=re.escape(msg)):
nca.fit(iris_data, iris_target)
@pytest.mark.parametrize(
"param, value",
[
("n_components", np.int32(3)),
("max_iter", np.int32(100)),
("tol", np.float32(0.0001)),
],
)
def test_parameters_valid_types(param, value):
# check that no error is raised when parameters have numpy integer or
# floating types.
nca = NeighborhoodComponentsAnalysis(**{param: value})
X = iris_data
y = iris_target
nca.fit(X, y)
def test_nca_feature_names_out():
"""Check `get_feature_names_out` for `NeighborhoodComponentsAnalysis`."""
X = iris_data
y = iris_target
est = NeighborhoodComponentsAnalysis().fit(X, y)
names_out = est.get_feature_names_out()
class_name_lower = est.__class__.__name__.lower()
expected_names_out = np.array(
[f"{class_name_lower}{i}" for i in range(est.components_.shape[1])],
dtype=object,
)
assert_array_equal(names_out, expected_names_out)

View File

@@ -0,0 +1,162 @@
"""
Testing for the nearest centroid module.
"""
import numpy as np
import pytest
from scipy import sparse as sp
from numpy.testing import assert_array_equal
from sklearn.neighbors import NearestCentroid
from sklearn import datasets
# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
X_csr = sp.csr_matrix(X) # Sparse matrix
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
T_csr = sp.csr_matrix(T)
true_result = [-1, 1, 1]
# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
def test_classification_toy():
# Check classification on a toy dataset, including sparse versions.
clf = NearestCentroid()
clf.fit(X, y)
assert_array_equal(clf.predict(T), true_result)
# Same test, but with a sparse matrix to fit and test.
clf = NearestCentroid()
clf.fit(X_csr, y)
assert_array_equal(clf.predict(T_csr), true_result)
# Fit with sparse, test with non-sparse
clf = NearestCentroid()
clf.fit(X_csr, y)
assert_array_equal(clf.predict(T), true_result)
# Fit with non-sparse, test with sparse
clf = NearestCentroid()
clf.fit(X, y)
assert_array_equal(clf.predict(T_csr), true_result)
# Fit and predict with non-CSR sparse matrices
clf = NearestCentroid()
clf.fit(X_csr.tocoo(), y)
assert_array_equal(clf.predict(T_csr.tolil()), true_result)
def test_precomputed():
clf = NearestCentroid(metric="precomputed")
with pytest.raises(ValueError):
clf.fit(X, y)
def test_iris():
# Check consistency on dataset iris.
for metric in ("euclidean", "cosine"):
clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
score = np.mean(clf.predict(iris.data) == iris.target)
assert score > 0.9, "Failed with score = " + str(score)
def test_iris_shrinkage():
# Check consistency on dataset iris, when using shrinkage.
for metric in ("euclidean", "cosine"):
for shrink_threshold in [None, 0.1, 0.5]:
clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold)
clf = clf.fit(iris.data, iris.target)
score = np.mean(clf.predict(iris.data) == iris.target)
assert score > 0.8, "Failed with score = " + str(score)
def test_pickle():
import pickle
# classification
obj = NearestCentroid()
obj.fit(iris.data, iris.target)
score = obj.score(iris.data, iris.target)
s = pickle.dumps(obj)
obj2 = pickle.loads(s)
assert type(obj2) == obj.__class__
score2 = obj2.score(iris.data, iris.target)
assert_array_equal(
score,
score2,
"Failed to generate same score after pickling (classification).",
)
def test_shrinkage_correct():
# Ensure that the shrinking is correct.
# The expected result is calculated by R (pamr),
# which is implemented by the author of the original paper.
# (One need to modify the code to output the new centroid in pamr.predict)
X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
y = np.array([1, 1, 2, 2, 2])
clf = NearestCentroid(shrink_threshold=0.1)
clf.fit(X, y)
expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
def test_shrinkage_threshold_decoded_y():
clf = NearestCentroid(shrink_threshold=0.01)
y_ind = np.asarray(y)
y_ind[y_ind == -1] = 0
clf.fit(X, y_ind)
centroid_encoded = clf.centroids_
clf.fit(X, y)
assert_array_equal(centroid_encoded, clf.centroids_)
def test_predict_translated_data():
# Test that NearestCentroid gives same results on translated data
rng = np.random.RandomState(0)
X = rng.rand(50, 50)
y = rng.randint(0, 3, 50)
noise = rng.rand(50)
clf = NearestCentroid(shrink_threshold=0.1)
clf.fit(X, y)
y_init = clf.predict(X)
clf = NearestCentroid(shrink_threshold=0.1)
X_noise = X + noise
clf.fit(X_noise, y)
y_translate = clf.predict(X_noise)
assert_array_equal(y_init, y_translate)
def test_manhattan_metric():
# Test the manhattan metric.
clf = NearestCentroid(metric="manhattan")
clf.fit(X, y)
dense_centroid = clf.centroids_
clf.fit(X_csr, y)
assert_array_equal(clf.centroids_, dense_centroid)
assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
def test_features_zero_var():
# Test that features with 0 variance throw error
X = np.empty((10, 2))
X[:, 0] = -0.13725701
X[:, 1] = -0.9853293
y = np.zeros((10))
y[0] = 1
clf = NearestCentroid(shrink_threshold=0.1)
with pytest.raises(ValueError):
clf.fit(X, y)

View File

@@ -0,0 +1,261 @@
"""
This is testing the equivalence between some estimators with internal nearest
neighbors computations, and the corresponding pipeline versions with
KNeighborsTransformer or RadiusNeighborsTransformer to precompute the
neighbors.
"""
import numpy as np
import pytest
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.neighbors import KNeighborsTransformer
from sklearn.neighbors import RadiusNeighborsTransformer
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
def test_spectral_clustering():
# Test chaining KNeighborsTransformer and SpectralClustering
n_neighbors = 5
X, _ = make_blobs(random_state=0)
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
SpectralClustering(
n_neighbors=n_neighbors, affinity="precomputed", random_state=42
),
)
est_compact = SpectralClustering(
n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42
)
labels_compact = est_compact.fit_predict(X)
labels_chain = est_chain.fit_predict(X)
assert_array_almost_equal(labels_chain, labels_compact)
def test_spectral_embedding():
# Test chaining KNeighborsTransformer and SpectralEmbedding
n_neighbors = 5
n_samples = 1000
centers = np.array(
[
[0.0, 5.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 4.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 5.0, 1.0],
]
)
S, true_labels = make_blobs(
n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"),
SpectralEmbedding(
n_neighbors=n_neighbors, affinity="precomputed", random_state=42
),
)
est_compact = SpectralEmbedding(
n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42
)
St_compact = est_compact.fit_transform(S)
St_chain = est_chain.fit_transform(S)
assert_array_almost_equal(St_chain, St_compact)
def test_dbscan():
# Test chaining RadiusNeighborsTransformer and DBSCAN
radius = 0.3
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
# compare the chained version and the compact version
est_chain = make_pipeline(
RadiusNeighborsTransformer(radius=radius, mode="distance"),
DBSCAN(metric="precomputed", eps=radius),
)
est_compact = DBSCAN(eps=radius)
labels_chain = est_chain.fit_predict(X)
labels_compact = est_compact.fit_predict(X)
assert_array_almost_equal(labels_chain, labels_compact)
def test_isomap():
# Test chaining KNeighborsTransformer and Isomap with
# neighbors_algorithm='precomputed'
algorithm = "auto"
n_neighbors = 10
X, _ = make_blobs(random_state=0)
X2, _ = make_blobs(random_state=1)
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(
n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
),
Isomap(n_neighbors=n_neighbors, metric="precomputed"),
)
est_compact = Isomap(n_neighbors=n_neighbors, neighbors_algorithm=algorithm)
Xt_chain = est_chain.fit_transform(X)
Xt_compact = est_compact.fit_transform(X)
assert_array_almost_equal(Xt_chain, Xt_compact)
Xt_chain = est_chain.transform(X2)
Xt_compact = est_compact.transform(X2)
assert_array_almost_equal(Xt_chain, Xt_compact)
# TODO: Remove filterwarning in 1.2
@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
def test_tsne():
# Test chaining KNeighborsTransformer and TSNE
n_iter = 250
perplexity = 5
n_neighbors = int(3.0 * perplexity + 1)
rng = np.random.RandomState(0)
X = rng.randn(20, 2)
for metric in ["minkowski", "sqeuclidean"]:
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(
n_neighbors=n_neighbors, mode="distance", metric=metric
),
TSNE(
metric="precomputed",
perplexity=perplexity,
method="barnes_hut",
random_state=42,
n_iter=n_iter,
),
)
est_compact = TSNE(
metric=metric,
perplexity=perplexity,
n_iter=n_iter,
method="barnes_hut",
random_state=42,
)
Xt_chain = est_chain.fit_transform(X)
Xt_compact = est_compact.fit_transform(X)
assert_array_almost_equal(Xt_chain, Xt_compact)
def test_lof_novelty_false():
# Test chaining KNeighborsTransformer and LocalOutlierFactor
n_neighbors = 4
rng = np.random.RandomState(0)
X = rng.randn(40, 2)
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
LocalOutlierFactor(
metric="precomputed",
n_neighbors=n_neighbors,
novelty=False,
contamination="auto",
),
)
est_compact = LocalOutlierFactor(
n_neighbors=n_neighbors, novelty=False, contamination="auto"
)
pred_chain = est_chain.fit_predict(X)
pred_compact = est_compact.fit_predict(X)
assert_array_almost_equal(pred_chain, pred_compact)
def test_lof_novelty_true():
# Test chaining KNeighborsTransformer and LocalOutlierFactor
n_neighbors = 4
rng = np.random.RandomState(0)
X1 = rng.randn(40, 2)
X2 = rng.randn(40, 2)
# compare the chained version and the compact version
est_chain = make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
LocalOutlierFactor(
metric="precomputed",
n_neighbors=n_neighbors,
novelty=True,
contamination="auto",
),
)
est_compact = LocalOutlierFactor(
n_neighbors=n_neighbors, novelty=True, contamination="auto"
)
pred_chain = est_chain.fit(X1).predict(X2)
pred_compact = est_compact.fit(X1).predict(X2)
assert_array_almost_equal(pred_chain, pred_compact)
def test_kneighbors_regressor():
# Test chaining KNeighborsTransformer and classifiers/regressors
rng = np.random.RandomState(0)
X = 2 * rng.rand(40, 5) - 1
X2 = 2 * rng.rand(40, 5) - 1
y = rng.rand(40, 1)
n_neighbors = 12
radius = 1.5
# We precompute more neighbors than necessary, to have equivalence between
# k-neighbors estimator after radius-neighbors transformer, and vice-versa.
factor = 2
k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance")
k_trans_factor = KNeighborsTransformer(
n_neighbors=int(n_neighbors * factor), mode="distance"
)
r_trans = RadiusNeighborsTransformer(radius=radius, mode="distance")
r_trans_factor = RadiusNeighborsTransformer(
radius=int(radius * factor), mode="distance"
)
k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
r_reg = RadiusNeighborsRegressor(radius=radius)
test_list = [
(k_trans, k_reg),
(k_trans_factor, r_reg),
(r_trans, r_reg),
(r_trans_factor, k_reg),
]
for trans, reg in test_list:
# compare the chained version and the compact version
reg_compact = clone(reg)
reg_precomp = clone(reg)
reg_precomp.set_params(metric="precomputed")
reg_chain = make_pipeline(clone(trans), reg_precomp)
y_pred_chain = reg_chain.fit(X, y).predict(X2)
y_pred_compact = reg_compact.fit(X, y).predict(X2)
assert_array_almost_equal(y_pred_chain, y_pred_compact)

View File

@@ -0,0 +1,290 @@
# License: BSD 3 clause
import pickle
import itertools
import numpy as np
import pytest
from sklearn.metrics import DistanceMetric
from sklearn.neighbors._ball_tree import (
BallTree,
kernel_norm,
DTYPE,
ITYPE,
NeighborsHeap as NeighborsHeapBT,
simultaneous_sort as simultaneous_sort_bt,
nodeheap_sort as nodeheap_sort_bt,
)
from sklearn.neighbors._kd_tree import (
KDTree,
NeighborsHeap as NeighborsHeapKDT,
simultaneous_sort as simultaneous_sort_kdt,
nodeheap_sort as nodeheap_sort_kdt,
)
from sklearn.utils import check_random_state
from numpy.testing import assert_array_almost_equal, assert_allclose
rng = np.random.RandomState(42)
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)
DIMENSION = 3
METRICS = {
"euclidean": {},
"manhattan": {},
"minkowski": dict(p=3),
"chebyshev": {},
"seuclidean": dict(V=rng.random_sample(DIMENSION)),
"wminkowski": dict(p=3, w=rng.random_sample(DIMENSION)),
"mahalanobis": dict(V=V_mahalanobis),
}
KD_TREE_METRICS = ["euclidean", "manhattan", "chebyshev", "minkowski"]
BALL_TREE_METRICS = list(METRICS)
def dist_func(x1, x2, p):
return np.sum((x1 - x2) ** p) ** (1.0 / p)
def compute_kernel_slow(Y, X, kernel, h):
d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
norm = kernel_norm(h, X.shape[1], kernel)
if kernel == "gaussian":
return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
elif kernel == "tophat":
return norm * (d < h).sum(-1)
elif kernel == "epanechnikov":
return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
elif kernel == "exponential":
return norm * (np.exp(-d / h)).sum(-1)
elif kernel == "linear":
return norm * ((1 - d / h) * (d < h)).sum(-1)
elif kernel == "cosine":
return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
else:
raise ValueError("kernel not recognized")
def brute_force_neighbors(X, Y, k, metric, **kwargs):
D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
ind = np.argsort(D, axis=1)[:, :k]
dist = D[np.arange(Y.shape[0])[:, None], ind]
return dist, ind
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize(
"kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"]
)
@pytest.mark.parametrize("h", [0.01, 0.1, 1])
@pytest.mark.parametrize("rtol", [0, 1e-5])
@pytest.mark.parametrize("atol", [1e-6, 1e-2])
@pytest.mark.parametrize("breadth_first", [True, False])
def test_kernel_density(
Cls, kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3
):
rng = check_random_state(1)
X = rng.random_sample((n_samples, n_features))
Y = rng.random_sample((n_samples, n_features))
dens_true = compute_kernel_slow(Y, X, kernel, h)
tree = Cls(X, leaf_size=10)
dens = tree.kernel_density(
Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first
)
assert_allclose(dens, dens_true, atol=atol, rtol=max(rtol, 1e-7))
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10):
rng = check_random_state(0)
X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
query_pt = np.zeros(n_features, dtype=float)
eps = 1e-15 # roundoff error can cause test to fail
tree = Cls(X, leaf_size=5)
rad = np.sqrt(((X - query_pt) ** 2).sum(1))
for r in np.linspace(rad[0], rad[-1], 100):
ind = tree.query_radius([query_pt], r + eps)[0]
i = np.where(rad <= r + eps)[0]
ind.sort()
i.sort()
assert_array_almost_equal(i, ind)
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_neighbor_tree_query_radius_distance(Cls, n_samples=100, n_features=10):
rng = check_random_state(0)
X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
query_pt = np.zeros(n_features, dtype=float)
eps = 1e-15 # roundoff error can cause test to fail
tree = Cls(X, leaf_size=5)
rad = np.sqrt(((X - query_pt) ** 2).sum(1))
for r in np.linspace(rad[0], rad[-1], 100):
ind, dist = tree.query_radius([query_pt], r + eps, return_distance=True)
ind = ind[0]
dist = dist[0]
d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))
assert_array_almost_equal(d, dist)
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
@pytest.mark.parametrize("dualtree", (True, False))
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
rng = check_random_state(0)
X = rng.random_sample((n_samples, n_features))
Y = rng.random_sample((n_samples, n_features))
r = np.linspace(0, 1, 10)
tree = Cls(X, leaf_size=10)
D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
counts_true = [(D <= ri).sum() for ri in r]
counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
assert_array_almost_equal(counts, counts_true)
@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT])
def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10):
heap = NeighborsHeap(n_pts, n_nbrs)
rng = check_random_state(0)
for row in range(n_pts):
d_in = rng.random_sample(2 * n_nbrs).astype(DTYPE, copy=False)
i_in = np.arange(2 * n_nbrs, dtype=ITYPE)
for d, i in zip(d_in, i_in):
heap.push(row, d, i)
ind = np.argsort(d_in)
d_in = d_in[ind]
i_in = i_in[ind]
d_heap, i_heap = heap.get_arrays(sort=True)
assert_array_almost_equal(d_in[:n_nbrs], d_heap[row])
assert_array_almost_equal(i_in[:n_nbrs], i_heap[row])
@pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt])
def test_node_heap(nodeheap_sort, n_nodes=50):
rng = check_random_state(0)
vals = rng.random_sample(n_nodes).astype(DTYPE, copy=False)
i1 = np.argsort(vals)
vals2, i2 = nodeheap_sort(vals)
assert_array_almost_equal(i1, i2)
assert_array_almost_equal(vals[i1], vals2)
@pytest.mark.parametrize(
"simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt]
)
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
rng = check_random_state(0)
dist = rng.random_sample((n_rows, n_pts)).astype(DTYPE, copy=False)
ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(ITYPE, copy=False)
dist2 = dist.copy()
ind2 = ind.copy()
# simultaneous sort rows using function
simultaneous_sort(dist, ind)
# simultaneous sort rows using numpy
i = np.argsort(dist2, axis=1)
row_ind = np.arange(n_rows)[:, None]
dist2 = dist2[row_ind, i]
ind2 = ind2[row_ind, i]
assert_array_almost_equal(dist, dist2)
assert_array_almost_equal(ind, ind2)
@pytest.mark.parametrize("Cls", [KDTree, BallTree])
def test_gaussian_kde(Cls, n_samples=1000):
# Compare gaussian KDE results to scipy.stats.gaussian_kde
from scipy.stats import gaussian_kde
rng = check_random_state(0)
x_in = rng.normal(0, 1, n_samples)
x_out = np.linspace(-5, 5, 30)
for h in [0.01, 0.1, 1]:
tree = Cls(x_in[:, None])
gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
dens_tree = tree.kernel_density(x_out[:, None], h) / n_samples
dens_gkde = gkde.evaluate(x_out)
assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"Cls, metric",
itertools.chain(
[(KDTree, metric) for metric in KD_TREE_METRICS],
[(BallTree, metric) for metric in BALL_TREE_METRICS],
),
)
@pytest.mark.parametrize("k", (1, 3, 5))
@pytest.mark.parametrize("dualtree", (True, False))
@pytest.mark.parametrize("breadth_first", (True, False))
def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first):
rng = check_random_state(0)
X = rng.random_sample((40, DIMENSION))
Y = rng.random_sample((10, DIMENSION))
kwargs = METRICS[metric]
kdt = Cls(X, leaf_size=1, metric=metric, **kwargs)
dist1, ind1 = kdt.query(Y, k, dualtree=dualtree, breadth_first=breadth_first)
dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)
# don't check indices here: if there are any duplicate distances,
# the indices may not match. Distances should not have this problem.
assert_array_almost_equal(dist1, dist2)
@pytest.mark.parametrize(
"Cls, metric",
[(KDTree, "euclidean"), (BallTree, "euclidean"), (BallTree, dist_func)],
)
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_pickle(Cls, metric, protocol):
rng = check_random_state(0)
X = rng.random_sample((10, 3))
if hasattr(metric, "__call__"):
kwargs = {"p": 2}
else:
kwargs = {}
tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs)
ind1, dist1 = tree1.query(X)
s = pickle.dumps(tree1, protocol=protocol)
tree2 = pickle.loads(s)
ind2, dist2 = tree2.query(X)
assert_array_almost_equal(ind1, ind2)
assert_array_almost_equal(dist1, dist2)
assert isinstance(tree2, Cls)

View File

@@ -0,0 +1,144 @@
import pickle
import numpy as np
import pytest
from sklearn.neighbors._quad_tree import _QuadTree
from sklearn.utils import check_random_state
def test_quadtree_boundary_computation():
# Introduce a point into a quad tree with boundaries not easy to compute.
Xs = []
# check a random case
Xs.append(np.array([[-1, 1], [-4, -1]], dtype=np.float32))
# check the case where only 0 are inserted
Xs.append(np.array([[0, 0], [0, 0]], dtype=np.float32))
# check the case where only negative are inserted
Xs.append(np.array([[-1, -2], [-4, 0]], dtype=np.float32))
# check the case where only small numbers are inserted
Xs.append(np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32))
for X in Xs:
tree = _QuadTree(n_dimensions=2, verbose=0)
tree.build_tree(X)
tree._check_coherence()
def test_quadtree_similar_point():
# Introduce a point into a quad tree where a similar point already exists.
# Test will hang if it doesn't complete.
Xs = []
# check the case where points are actually different
Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32))
# check the case where points are the same on X axis
Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32))
# check the case where points are arbitrarily close on X axis
Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32))
# check the case where points are the same on Y axis
Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32))
# check the case where points are arbitrarily close on Y axis
Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32))
# check the case where points are arbitrarily close on both axes
Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]], dtype=np.float32))
# check the case where points are arbitrarily close on both axes
# close to machine epsilon - x axis
Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]], dtype=np.float32))
# check the case where points are arbitrarily close on both axes
# close to machine epsilon - y axis
Xs.append(
np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]], dtype=np.float32)
)
for X in Xs:
tree = _QuadTree(n_dimensions=2, verbose=0)
tree.build_tree(X)
tree._check_coherence()
@pytest.mark.parametrize("n_dimensions", (2, 3))
@pytest.mark.parametrize("protocol", (0, 1, 2))
def test_quad_tree_pickle(n_dimensions, protocol):
rng = check_random_state(0)
X = rng.random_sample((10, n_dimensions))
tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
tree.build_tree(X)
s = pickle.dumps(tree, protocol=protocol)
bt2 = pickle.loads(s)
for x in X:
cell_x_tree = tree.get_cell(x)
cell_x_bt2 = bt2.get_cell(x)
assert cell_x_tree == cell_x_bt2
@pytest.mark.parametrize("n_dimensions", (2, 3))
def test_qt_insert_duplicate(n_dimensions):
rng = check_random_state(0)
X = rng.random_sample((10, n_dimensions))
Xd = np.r_[X, X[:5]]
tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
tree.build_tree(Xd)
cumulative_size = tree.cumulative_size
leafs = tree.leafs
# Assert that the first 5 are indeed duplicated and that the next
# ones are single point leaf
for i, x in enumerate(X):
cell_id = tree.get_cell(x)
assert leafs[cell_id]
assert cumulative_size[cell_id] == 1 + (i < 5)
def test_summarize():
# Simple check for quad tree's summarize
angle = 0.9
X = np.array(
[[-10.0, -10.0], [9.0, 10.0], [10.0, 9.0], [10.0, 10.0]], dtype=np.float32
)
query_pt = X[0, :]
n_dimensions = X.shape[1]
offset = n_dimensions + 2
qt = _QuadTree(n_dimensions, verbose=0)
qt.build_tree(X)
idx, summary = qt._py_summarize(query_pt, X, angle)
node_dist = summary[n_dimensions]
node_size = summary[n_dimensions + 1]
# Summary should contain only 1 node with size 3 and distance to
# X[1:] barycenter
barycenter = X[1:].mean(axis=0)
ds2c = ((X[0] - barycenter) ** 2).sum()
assert idx == offset
assert node_size == 3, "summary size = {}".format(node_size)
assert np.isclose(node_dist, ds2c)
# Summary should contain all 3 node with size 1 and distance to
# each point in X[1:] for ``angle=0``
idx, summary = qt._py_summarize(query_pt, X, 0.0)
barycenter = X[1:].mean(axis=0)
ds2c = ((X[0] - barycenter) ** 2).sum()
assert idx == 3 * (offset)
for i in range(3):
node_dist = summary[i * offset + n_dimensions]
node_size = summary[i * offset + n_dimensions + 1]
ds2c = ((X[0] - X[i + 1]) ** 2).sum()
assert node_size == 1, "summary size = {}".format(node_size)
assert np.isclose(node_dist, ds2c)