first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,102 @@
"""
The :mod:`sklearn.linear_model` module implements a variety of linear models.
"""
# See http://scikit-learn.sourceforge.net/modules/sgd.html and
# http://scikit-learn.sourceforge.net/modules/linear_model.html for
# complete documentation.
from ._base import LinearRegression
from ._bayes import BayesianRidge, ARDRegression
from ._least_angle import (
Lars,
LassoLars,
lars_path,
lars_path_gram,
LarsCV,
LassoLarsCV,
LassoLarsIC,
)
from ._coordinate_descent import (
Lasso,
ElasticNet,
LassoCV,
ElasticNetCV,
lasso_path,
enet_path,
MultiTaskLasso,
MultiTaskElasticNet,
MultiTaskElasticNetCV,
MultiTaskLassoCV,
)
from ._glm import PoissonRegressor, GammaRegressor, TweedieRegressor
from ._huber import HuberRegressor
from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
from ._stochastic_gradient import SGDClassifier, SGDRegressor, SGDOneClassSVM
from ._ridge import Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, ridge_regression
from ._logistic import LogisticRegression, LogisticRegressionCV
from ._omp import (
orthogonal_mp,
orthogonal_mp_gram,
OrthogonalMatchingPursuit,
OrthogonalMatchingPursuitCV,
)
from ._passive_aggressive import PassiveAggressiveClassifier
from ._passive_aggressive import PassiveAggressiveRegressor
from ._perceptron import Perceptron
from ._quantile import QuantileRegressor
from ._ransac import RANSACRegressor
from ._theil_sen import TheilSenRegressor
# Public API of the ``sklearn.linear_model`` package: the names exported by
# ``from sklearn.linear_model import *``. Every entry is imported above from
# one of the private submodules.
__all__ = [
"ARDRegression",
"BayesianRidge",
"ElasticNet",
"ElasticNetCV",
"Hinge",
"Huber",
"HuberRegressor",
"Lars",
"LarsCV",
"Lasso",
"LassoCV",
"LassoLars",
"LassoLarsCV",
"LassoLarsIC",
"LinearRegression",
"Log",
"LogisticRegression",
"LogisticRegressionCV",
"ModifiedHuber",
"MultiTaskElasticNet",
"MultiTaskElasticNetCV",
"MultiTaskLasso",
"MultiTaskLassoCV",
"OrthogonalMatchingPursuit",
"OrthogonalMatchingPursuitCV",
"PassiveAggressiveClassifier",
"PassiveAggressiveRegressor",
"Perceptron",
"QuantileRegressor",
"Ridge",
"RidgeCV",
"RidgeClassifier",
"RidgeClassifierCV",
"SGDClassifier",
"SGDRegressor",
"SGDOneClassSVM",
"SquaredLoss",
"TheilSenRegressor",
"enet_path",
"lars_path",
"lars_path_gram",
"lasso_path",
"orthogonal_mp",
"orthogonal_mp_gram",
"ridge_regression",
"RANSACRegressor",
"PoissonRegressor",
"GammaRegressor",
"TweedieRegressor",
]

View File

@@ -0,0 +1,902 @@
"""
Generalized Linear Models.
"""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Olivier Grisel <olivier.grisel@ensta.org>
# Vincent Michel <vincent.michel@inria.fr>
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Mathieu Blondel <mathieu@mblondel.org>
# Lars Buitinck
# Maryan Morel <maryan.morel@polytechnique.edu>
# Giorgio Patrini <giorgio.patrini@anu.edu.au>
# Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
import numbers
import warnings
import numpy as np
import scipy.sparse as sp
from scipy import linalg
from scipy import optimize
from scipy import sparse
from scipy.sparse.linalg import lsqr
from scipy.special import expit
from joblib import Parallel
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin
from ..preprocessing._data import _is_constant_feature
from ..utils import check_array
from ..utils.validation import FLOAT_DTYPES
from ..utils import check_random_state
from ..utils.extmath import safe_sparse_dot
from ..utils.extmath import _incremental_mean_and_var
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from ..utils._seq_dataset import ArrayDataset32, CSRDataset32
from ..utils._seq_dataset import ArrayDataset64, CSRDataset64
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..utils.fixes import delayed
# TODO: bayesian_ridge_regression and bayesian_regression_ard
# should be squashed into its respective objects.
# Value returned by ``make_dataset`` as ``intercept_decay`` when the input is
# sparse (dense inputs get 1.0).
SPARSE_INTERCEPT_DECAY = 0.01
# For sparse data intercept updates are scaled by this decay factor to avoid
# intercept oscillation.
# FIXME in 1.2: parameter 'normalize' should be removed from linear models
# in cases where now normalize=False. The default value of 'normalize' should
# be changed to False in linear models where now normalize=True
def _deprecate_normalize(normalize, default, estimator_name):
    """Resolve the effective ``normalize`` value and warn about its deprecation.

    ``normalize`` is being phased out of the linear models in favour of a
    pipeline with a ``StandardScaler``. This helper validates the value given
    by the user, decides which value the estimator should actually use and
    emits the ``FutureWarning`` that matches the combination of the user's
    choice and the estimator's default.

    Parameters
    ----------
    normalize : bool or "deprecated"
        Value passed by the user. ``"deprecated"`` means the parameter was
        left at its sentinel default.

    default : bool
        Default ``normalize`` value of the calling estimator.

    estimator_name : str
        Name of the linear estimator which calls this function. It is
        inserted in the pipeline example of the warning and selects the hint
        on how to rescale ``alpha`` / ``alphas``.

    Returns
    -------
    normalize : bool
        ``default`` when ``normalize == "deprecated"``, otherwise the value
        passed by the user. This is the value the estimator should use at
        this stage of the deprecation process.

    Raises
    ------
    ValueError
        If ``normalize`` is not True, False or ``"deprecated"``.

    Notes
    -----
    Warnings emitted (all of category ``FutureWarning``):

    - ``normalize="deprecated"`` and ``default`` truthy: announces the change
      of default, followed by the pipeline and alpha hints.
    - ``normalize`` truthy and ``default`` falsy: announces the deprecation,
      followed by the pipeline and alpha hints.
    - ``normalize`` falsy and ``default`` falsy: asks the user to leave the
      parameter at its default value.
    - any other combination: no warning.

    This function should be updated in 1.2 depending on the value of
    `normalize`:

    - True, warning: `normalize` was deprecated in 1.2 and will be removed in
      1.4. Suggest to use pipeline instead.
    - False, `normalize` was deprecated in 1.2 and it will be removed in 1.4.
      Leave normalize to its default value.
    - `deprecated` - this should only be possible with default == False as
      from 1.2 `normalize` in all the linear models should be either removed
      or the default should be set to False.

    This function should be completely removed in 1.4.
    """
    accepted_values = [True, False, "deprecated"]
    if normalize not in accepted_values:
        raise ValueError(
            "Leave 'normalize' to its default value or set it to True or False"
        )

    left_at_default = normalize == "deprecated"
    _normalize = default if left_at_default else normalize

    # Both hint texts are always built, exactly as before, even when no
    # warning ends up being emitted.
    pipeline_msg = (
        "If you wish to scale the data, use Pipeline with a StandardScaler "
        "in a preprocessing stage. To reproduce the previous behavior:\n\n"
        "from sklearn.pipeline import make_pipeline\n\n"
        "model = make_pipeline(StandardScaler(with_mean=False), "
        f"{estimator_name}())\n\n"
        "If you wish to pass a sample_weight parameter, you need to pass it "
        "as a fit parameter to each step of the pipeline as follows:\n\n"
        "kwargs = {s[0] + '__sample_weight': sample_weight for s "
        "in model.steps}\n"
        "model.fit(X, y, **kwargs)\n\n"
    )

    if estimator_name in ("Ridge", "RidgeClassifier"):
        alpha_msg = "Set parameter alpha to: original_alpha * n_samples. "
    elif "Lasso" in estimator_name:
        alpha_msg = "Set parameter alpha to: original_alpha * np.sqrt(n_samples). "
    elif "ElasticNet" in estimator_name:
        alpha_msg = (
            "Set parameter alpha to original_alpha * np.sqrt(n_samples) if "
            "l1_ratio is 1, and to original_alpha * n_samples if l1_ratio is "
            "0. For other values of l1_ratio, no analytic formula is "
            "available."
        )
    elif estimator_name in ("RidgeCV", "RidgeClassifierCV", "_RidgeGCV"):
        alpha_msg = "Set parameter alphas to: original_alphas * n_samples. "
    else:
        alpha_msg = ""

    warning_text = None
    if left_at_default:
        # Only estimators whose default is still True need to be told that
        # the default is about to change.
        if default:
            warning_text = (
                "The default of 'normalize' will be set to False in version 1.2 "
                "and deprecated in version 1.4.\n" + pipeline_msg + alpha_msg
            )
    elif not default:
        if normalize:
            warning_text = (
                "'normalize' was deprecated in version 1.0 and will be removed in 1.2.\n"
                + pipeline_msg
                + alpha_msg
            )
        else:
            warning_text = (
                "'normalize' was deprecated in version 1.0 and will be "
                "removed in 1.2. "
                "Please leave the normalize parameter to its default value to "
                "silence this warning. The default behavior of this estimator "
                "is to not do any normalization. If normalization is needed "
                "please use sklearn.preprocessing.StandardScaler instead."
            )

    if warning_text is not None:
        warnings.warn(warning_text, FutureWarning)

    return _normalize
def make_dataset(X, y, sample_weight, random_state=None):
    """Wrap ``X``, ``y`` and ``sample_weight`` in a ``Dataset`` object.

    The dataset class is chosen from the dtype of ``X`` (the 32-bit variants
    for ``float32``, the 64-bit variants otherwise) and from whether ``X`` is
    sparse (CSR dataset) or dense (array dataset). The ``intercept_decay``
    matching that choice is returned alongside.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.

    y : array-like, shape (n_samples, )
        Target values.

    sample_weight : numpy array of shape (n_samples,)
        The weight of each sample.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset random sampling. It
        is not used for dataset shuffling.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    dataset
        The ``Dataset`` abstraction.
    intercept_decay
        ``SPARSE_INTERCEPT_DECAY`` when ``X`` is sparse, ``1.0`` otherwise.
    """
    rng = check_random_state(random_state)
    # seed should never be 0 in SequentialDataset64
    seed = rng.randint(1, np.iinfo(np.int32).max)

    single_precision = X.dtype == np.float32
    sparse_dataset_cls = CSRDataset32 if single_precision else CSRDataset64
    dense_dataset_cls = ArrayDataset32 if single_precision else ArrayDataset64

    if sp.issparse(X):
        dataset = sparse_dataset_cls(
            X.data, X.indptr, X.indices, y, sample_weight, seed=seed
        )
        return dataset, SPARSE_INTERCEPT_DECAY

    # The array dataset is handed a C-contiguous version of the dense input.
    contiguous_X = np.ascontiguousarray(X)
    dataset = dense_dataset_cls(contiguous_X, y, sample_weight, seed=seed)
    return dataset, 1.0
def _preprocess_data(
    X,
    y,
    fit_intercept,
    normalize=False,
    copy=True,
    sample_weight=None,
    check_input=True,
):
    """Center and optionally scale ``X`` and center ``y``.

    When ``fit_intercept`` is True, the (possibly weighted) column means of
    ``X`` and the mean of ``y`` are computed. Dense ``X`` is centered in
    place; sparse ``X`` is never centered, but its mean is still returned so
    that callers can account for it. With ``normalize=True`` the columns of
    ``X`` (dense or sparse) are additionally divided by ``X_scale``.

    When ``fit_intercept`` is False, neither ``X`` nor ``y`` is modified and
    neutral statistics (zero offsets, unit scale) are returned.

    ``y`` is always converted to an array with the dtype of ``X``.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Input data.

    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    fit_intercept : bool
        Whether to compute offsets and center the data.

    normalize : bool, default=False
        Whether to rescale the columns of ``X``. Only has an effect when
        ``fit_intercept`` is True.

    copy : bool, default=True
        If True, work on a copy of ``X``; otherwise dense ``X`` is modified
        in place.

    sample_weight : array-like of shape (n_samples,) or number, default=None
        Weights used for the means and variances. A plain number is treated
        like ``None``.

    check_input : bool, default=True
        If True, ``X`` is validated (and copied according to ``copy``) with
        ``check_array``. If False, ``X`` is used as is and only copied when
        ``copy`` is True.

    Returns
    -------
    X_out : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Processed ``X``.
    y_out : ndarray of shape (n_samples,) or (n_samples, n_targets)
        ``y`` minus ``y_offset`` when ``fit_intercept`` is True, otherwise
        ``y`` converted to the dtype of ``X``.
    X_offset : ndarray of shape (n_features,)
        (Weighted) mean per column of the input ``X``; zeros when
        ``fit_intercept`` is False.
    y_offset : float or ndarray of shape (n_targets,)
        (Weighted) mean of ``y``; zero(s) when ``fit_intercept`` is False.
    X_scale : ndarray of shape (n_features,)
        Factor each column of ``X`` was divided by. With ``normalize=True``
        this is ``sqrt(variance * n)``, where ``n`` is the number of samples
        or the sum of the sample weights, and 1 for features detected as
        constant. Otherwise it is all ones.
    """
    # A scalar weight is handled exactly like "no weights".
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy() if sp.issparse(X) else X.copy(order="K")

    y = np.asarray(y, dtype=X.dtype)

    if not fit_intercept:
        # Nothing is centered or scaled: hand back neutral statistics.
        n_features = X.shape[1]
        X_offset = np.zeros(n_features, dtype=X.dtype)
        X_scale = np.ones(n_features, dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_scale

    X_is_sparse = sp.issparse(X)

    if X_is_sparse:
        # Sparse data is not centered; only its statistics are computed.
        X_offset, X_var = mean_variance_axis(X, axis=0, weights=sample_weight)
    else:
        if normalize:
            X_offset, X_var, _ = _incremental_mean_and_var(
                X,
                last_mean=0.0,
                last_variance=0.0,
                last_sample_count=0.0,
                sample_weight=sample_weight,
            )
        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)

        X_offset = X_offset.astype(X.dtype, copy=False)
        # In-place on purpose: with copy=False the caller's array is centered.
        X -= X_offset

    if normalize:
        X_var = X_var.astype(X.dtype, copy=False)
        # Detect constant features on the computed variance, before taking
        # the np.sqrt. Otherwise constant features cannot be detected with
        # sample weights.
        constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])
        weight_total = X.shape[0] if sample_weight is None else sample_weight.sum()
        X_var *= weight_total
        # The square root is written back into X_var's buffer.
        X_scale = np.sqrt(X_var, out=X_var)
        X_scale[constant_mask] = 1.0
        if X_is_sparse:
            inplace_column_scale(X, 1.0 / X_scale)
        else:
            X /= X_scale
    else:
        X_scale = np.ones(X.shape[1], dtype=X.dtype)

    y_offset = np.average(y, axis=0, weights=sample_weight)
    y = y - y_offset

    return X, y, X_offset, y_offset, X_scale
# TODO: _rescale_data should be factored into _preprocess_data.
# Currently, the fact that sag implements its own way to deal with
# sample_weight makes the refactoring tricky.
def _rescale_data(X, y, sample_weight):
    """Multiply each sample of ``X`` and ``y`` by the square root of its weight.

    For many linear models, this enables easy support for sample_weight
    because

        (y - X w)' S (y - X w)

    with S = diag(sample_weight) becomes

        ||y_rescaled - X_rescaled w||_2^2

    when setting

        y_rescaled = sqrt(S) y
        X_rescaled = sqrt(S) X

    Parameters
    ----------
    X : {array-like, sparse matrix} with ``n_samples`` rows
        Data to rescale.

    y : array-like with ``n_samples`` rows
        Targets to rescale.

    sample_weight : array-like of shape (n_samples,) or scalar
        Sample weights. A 0-d value is expanded to one weight per sample.

    Returns
    -------
    X_rescaled : {array-like, sparse matrix}

    y_rescaled : {array-like, sparse matrix}

    sample_weight_sqrt : ndarray of shape (n_samples,)
        Square root of the (expanded) sample weights.
    """
    n_samples = X.shape[0]

    weights = np.asarray(sample_weight)
    if weights.ndim == 0:
        # Scalar weight: give every sample the same weight.
        weights = np.full(n_samples, weights, dtype=weights.dtype)
    sample_weight_sqrt = np.sqrt(weights)

    # sqrt(S) as a sparse diagonal matrix, applied on the left of X and y.
    sqrt_diag = sparse.dia_matrix(
        (sample_weight_sqrt, 0), shape=(n_samples, n_samples)
    )
    X_rescaled = safe_sparse_dot(sqrt_diag, X)
    y_rescaled = safe_sparse_dot(sqrt_diag, y)
    return X_rescaled, y_rescaled, sample_weight_sqrt
class LinearModel(BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for linear models.

    Subclasses implement ``fit`` and are expected to set ``coef_`` and
    ``intercept_``, which ``predict`` combines as
    ``X @ coef_.T + intercept_``.
    """

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        """Return ``X @ coef_.T + intercept_`` after validating ``X``."""
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
        linear_term = safe_sparse_dot(X, self.coef_.T, dense_output=True)
        return linear_term + self.intercept_

    def predict(self, X):
        """
        Predict using the linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape (n_samples,)
            Returns predicted values.
        """
        return self._decision_function(X)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Set ``intercept_`` and map ``coef_`` back to the original scale of X.

        Without an intercept, ``coef_`` is left untouched and ``intercept_``
        is set to 0.0.
        """
        if not self.fit_intercept:
            self.intercept_ = 0.0
            return

        # Undo the column scaling applied to X before the fit.
        self.coef_ = self.coef_ / X_scale
        self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)

    def _more_tags(self):
        return {"requires_y": True}
# XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
# Maybe the n_features checking can be moved to LinearModel.
class LinearClassifierMixin(ClassifierMixin):
    """Mixin providing prediction methods for linear classifiers.

    Works with dense and sparse ``X`` and relies on the host estimator
    exposing ``coef_``, ``intercept_`` and ``classes_``.
    """

    def decision_function(self, X):
        """
        Predict confidence scores for samples.

        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the confidence scores.

        Returns
        -------
        scores : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Confidence scores per `(n_samples, n_classes)` combination. In the
            binary case, confidence score for `self.classes_[1]` where >0 means
            this class would be predicted.
        """
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse="csr", reset=False)
        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
        # A single score column is flattened to a 1-D array.
        if scores.shape[1] == 1:
            return scores.ravel()
        return scores

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the predictions.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Vector containing the class labels for each sample.
        """
        scores = self.decision_function(X)
        is_binary = len(scores.shape) == 1
        # 1-D scores: positive score selects classes_[1]; otherwise the class
        # with the highest score wins.
        indices = (scores > 0).astype(int) if is_binary else scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Probability estimation for OvR logistic regression.

        Positive class probabilities are computed as
        1. / (1. + np.exp(-self.decision_function(X)));
        multiclass is handled by normalizing that over all classes.
        """
        prob = self.decision_function(X)
        # The logistic function is applied in place on the score array.
        expit(prob, out=prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T

        # OvR normalization, like LibLinear's predict_probability
        prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
        return prob
class SparseCoefMixin:
    """Mixin for switching ``coef_`` between dense and CSR storage.

    L1-regularizing estimators should inherit this.
    """

    def densify(self):
        """
        Convert coefficient matrix to dense array format.

        Converts the ``coef_`` member (back) to a numpy.ndarray. This is the
        default format of ``coef_`` and is required for fitting, so calling
        this method is only required on models that have previously been
        sparsified; otherwise, it is a no-op.

        Returns
        -------
        self
            Fitted estimator.
        """
        not_fitted_msg = "Estimator, %(name)s, must be fitted before densifying."
        check_is_fitted(self, msg=not_fitted_msg)
        current_coef = self.coef_
        if sp.issparse(current_coef):
            self.coef_ = current_coef.toarray()
        return self

    def sparsify(self):
        """
        Convert coefficient matrix to sparse format.

        Converts the ``coef_`` member to a scipy.sparse matrix, which for
        L1-regularized models can be much more memory- and storage-efficient
        than the usual numpy.ndarray representation.

        The ``intercept_`` member is not converted.

        Returns
        -------
        self
            Fitted estimator.

        Notes
        -----
        For non-sparse models, i.e. when there are not many zeros in ``coef_``,
        this may actually *increase* memory usage, so use this method with
        care. A rule of thumb is that the number of zero elements, which can
        be computed with ``(coef_ == 0).sum()``, must be more than 50% for this
        to provide significant benefits.

        After calling this method, further fitting with the partial_fit
        method (if any) will not work until you call densify.
        """
        not_fitted_msg = "Estimator, %(name)s, must be fitted before sparsifying."
        check_is_fitted(self, msg=not_fitted_msg)
        sparse_coef = sp.csr_matrix(self.coef_)
        self.coef_ = sparse_coef
        return self
class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
    """
    Ordinary least squares Linear Regression.

    LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
    to minimize the residual sum of squares between the observed targets in
    the dataset, and the targets predicted by the linear approximation.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

        .. deprecated:: 1.0
           `normalize` was deprecated in version 1.0 and will be
           removed in 1.2.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This will only provide
        speedup in case of sufficiently large problems, that is if firstly
        `n_targets > 1` and secondly `X` is sparse or if `positive` is set
        to `True`. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive. This
        option is only supported for dense arrays.

        .. versionadded:: 0.24

    Attributes
    ----------
    coef_ : array of shape (n_features, ) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.
        If multiple targets are passed during the fit (y 2D), this
        is a 2D array of shape (n_targets, n_features), while if only
        one target is passed, this is a 1D array of length n_features.

    rank_ : int
        Rank of matrix `X`. Only set by the dense solver used when
        `positive=False`.

    singular_ : array of shape (min(X, y),)
        Singular values of `X`. Only set by the dense solver used when
        `positive=False`.

    intercept_ : float or array of shape (n_targets,)
        Independent term in the linear model. Set to 0.0 if
        `fit_intercept = False`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression addresses some of the
        problems of Ordinary Least Squares by imposing a penalty on the
        size of the coefficients with l2 regularization.
    Lasso : The Lasso is a linear model that estimates
        sparse coefficients with l1 regularization.
    ElasticNet : Elastic-Net is a linear regression
        model trained with both l1 and l2 -norm regularization of the
        coefficients.

    Notes
    -----
    From the implementation point of view, this is just plain Ordinary
    Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares
    (scipy.optimize.nnls) wrapped as a predictor object. Sparse input is
    solved with scipy.sparse.linalg.lsqr.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
    >>> # y = 1 * x_0 + 2 * x_1 + 3
    >>> y = np.dot(X, np.array([1, 2])) + 3
    >>> reg = LinearRegression().fit(X, y)
    >>> reg.score(X, y)
    1.0
    >>> reg.coef_
    array([1., 2.])
    >>> reg.intercept_
    3.0...
    >>> reg.predict(np.array([[3, 5]]))
    array([16.])
    """

    def __init__(
        self,
        *,
        fit_intercept=True,
        normalize="deprecated",
        copy_X=True,
        n_jobs=None,
        positive=False,
    ):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.positive = positive

    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.17
               parameter *sample_weight* support to LinearRegression.

        Returns
        -------
        self : object
            Fitted Estimator.
        """
        _normalize = _deprecate_normalize(
            self.normalize, default=False, estimator_name=self.__class__.__name__
        )

        n_jobs_ = self.n_jobs

        # Sparse input is rejected when positive coefficients are requested.
        accept_sparse = ["csr", "csc", "coo"] if not self.positive else False

        X, y = self._validate_data(
            X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
        )

        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=X.dtype, only_non_negative=True
        )

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=_normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        # Sample weight can be implemented via a simple rescaling.
        X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)

        single_target = y.ndim < 2

        if self.positive:
            if single_target:
                self.coef_ = optimize.nnls(X, y)[0]
            else:
                # scipy.optimize.nnls cannot handle y with shape (M, K)
                per_target = Parallel(n_jobs=n_jobs_)(
                    delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])
                )
                self.coef_ = np.vstack([result[0] for result in per_target])
        elif sp.issparse(X):
            # Sparse X was not centered by the preprocessing; the centering is
            # applied implicitly through a linear operator instead.
            X_offset_scale = X_offset / X_scale

            def _apply(b):
                return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale)

            def _apply_transposed(b):
                return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt)

            X_centered = sparse.linalg.LinearOperator(
                shape=X.shape, matvec=_apply, rmatvec=_apply_transposed
            )

            if single_target:
                self.coef_ = lsqr(X_centered, y)[0]
            else:
                # sparse_lstsq cannot handle y with shape (M, K)
                per_target = Parallel(n_jobs=n_jobs_)(
                    delayed(lsqr)(X_centered, y[:, j].ravel())
                    for j in range(y.shape[1])
                )
                self.coef_ = np.vstack([result[0] for result in per_target])
        else:
            solution, _, self.rank_, self.singular_ = linalg.lstsq(X, y)
            self.coef_ = solution
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
def _check_precomputed_gram_matrix(
    X, precompute, X_offset, X_scale, rtol=1e-7, atol=1e-5
):
    """Spot-check one entry of a user-supplied Gram matrix.

    A single element of the Gram matrix is recomputed from two columns of
    ``X`` (each column has its ``X_offset`` entry subtracted and is then
    multiplied by its ``X_scale`` entry) and compared to the corresponding
    element of ``precompute``. A mismatch raises a ValueError.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data array.

    precompute : array-like of shape (n_features, n_features)
        User-supplied gram matrix.

    X_offset : ndarray of shape (n_features,)
        Array of feature means used to center design matrix.

    X_scale : ndarray of shape (n_features,)
        Array of feature scale factors used to normalize design matrix.

    rtol : float, default=1e-7
        Relative tolerance; see :func:`numpy.isclose`.

    atol : float, default=1e-5
        Absolute tolerance; see :func:`numpy.isclose`. Note that the default
        here is more tolerant than the default for
        :func:`numpy.testing.assert_allclose`, where `atol=0`.

    Raises
    ------
    ValueError
        Raised when the provided Gram matrix is not consistent.
    """
    n_features = X.shape[1]
    # Check the middle column against its right neighbour; the neighbour index
    # is clipped to the last column.
    f1 = n_features // 2
    f2 = min(f1 + 1, n_features - 1)

    def _column(index):
        return (X[:, index] - X_offset[index]) * X_scale[index]

    expected = np.dot(_column(f1), _column(f2))
    actual = precompute[f1, f2]

    if np.isclose(expected, actual, rtol=rtol, atol=atol):
        return

    raise ValueError(
        "Gram matrix passed in via 'precompute' parameter "
        "did not pass validation when a single element was "
        "checked - please check that it was computed "
        f"properly. For element ({f1},{f2}) we computed "
        f"{expected} but the user-supplied value was "
        f"{actual}."
    )
def _pre_fit(
    X,
    y,
    Xy,
    precompute,
    normalize,
    fit_intercept,
    copy,
    check_input=True,
    sample_weight=None,
):
    """Function used at beginning of fit in linear models with L1 or L0 penalty.

    This function applies _preprocess_data and additionally computes the gram
    matrix `precompute` as needed as well as `Xy`.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : array-like with 1 or 2 dimensions
        Target values.

    Xy : array-like or None
        Precomputed ``X.T @ y``. It is discarded (set to None) whenever the
        Gram matrix is recomputed or not used, and computed here when a Gram
        matrix is available and ``Xy`` is None.

    precompute : bool, "auto" or array-like of shape (n_features, n_features)
        Whether to use a Gram matrix, or the Gram matrix itself. Forced to
        False for sparse ``X``. ``"auto"`` resolves to
        ``n_samples > n_features``.

    normalize : bool
        Forwarded to ``_preprocess_data``.

    fit_intercept : bool
        Forwarded to ``_preprocess_data``.

    copy : bool
        Forwarded to ``_preprocess_data`` for dense ``X``. Sparse ``X`` is
        always preprocessed with ``copy=False``.

    check_input : bool, default=True
        Forwarded to ``_preprocess_data``. Also controls whether a
        user-supplied Gram matrix is sanity-checked.

    sample_weight : array-like or None, default=None
        Forwarded to ``_preprocess_data``. For dense ``X`` the data is
        additionally rescaled with ``_rescale_data``.

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
        Output of ``_preprocess_data`` (after the optional rescaling).
    precompute : bool or ndarray of shape (n_features, n_features)
        False, or the Gram matrix to use.
    Xy : ndarray or None
        ``X.T @ y`` when a Gram matrix is used, None otherwise.
    """
    n_samples, n_features = X.shape

    if sparse.isspmatrix(X):
        # copy is not needed here as X is not modified inplace when X is sparse
        precompute = False
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=fit_intercept,
            normalize=normalize,
            copy=False,
            check_input=check_input,
            sample_weight=sample_weight,
        )
    else:
        # copy was done in fit if necessary
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=fit_intercept,
            normalize=normalize,
            copy=copy,
            check_input=check_input,
            sample_weight=sample_weight,
        )
        # Rescale only in dense case. Sparse cd solver directly deals with
        # sample_weight.
        if sample_weight is not None:
            # This triggers copies anyway.
            X, y, _ = _rescale_data(X, y, sample_weight=sample_weight)

    # FIXME: 'normalize' to be removed in 1.2
    if hasattr(precompute, "__array__"):
        # A user-supplied Gram matrix is only valid if preprocessing left X
        # unchanged (no effective centering, no effective scaling).
        was_centered = fit_intercept and not np.allclose(
            X_offset, np.zeros(n_features)
        )
        was_scaled = normalize and not np.allclose(X_scale, np.ones(n_features))
        if was_centered or was_scaled:
            warnings.warn(
                "Gram matrix was provided but X was centered to fit "
                "intercept, or X was normalized : recomputing Gram matrix.",
                UserWarning,
            )
            # recompute Gram
            precompute = "auto"
            Xy = None
        elif check_input:
            # If we're going to use the user's precomputed gram matrix, we
            # do a quick check to make sure its not totally bogus.
            _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale)

    # precompute if n_samples > n_features
    if isinstance(precompute, str) and precompute == "auto":
        precompute = n_samples > n_features

    if precompute is True:
        # make sure that the 'precompute' array is contiguous.
        precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C")
        np.dot(X.T, X, out=precompute)

    if not hasattr(precompute, "__array__"):
        Xy = None  # cannot use Xy if precompute is not Gram

    if hasattr(precompute, "__array__") and Xy is None:
        # np.find_common_type was removed in NumPy 2.0; np.result_type gives
        # the same promoted dtype for two array dtypes.
        common_dtype = np.result_type(X.dtype, y.dtype)
        if y.ndim == 1:
            # Xy is 1d, make sure it is contiguous.
            Xy = np.empty(shape=n_features, dtype=common_dtype, order="C")
            np.dot(X.T, y, out=Xy)
        else:
            # Make sure that Xy is always F contiguous even if X or y are not
            # contiguous: the goal is to make it fast to extract the data for a
            # specific target.
            n_targets = y.shape[1]
            Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F")
            np.dot(y.T, X, out=Xy.T)

    return X, y, X_offset, y_offset, X_scale, precompute, Xy

View File

@@ -0,0 +1,779 @@
"""
Various bayesian regression
"""
# Authors: V. Michel, F. Pedregosa, A. Gramfort
# License: BSD 3 clause
from math import log
import numpy as np
from scipy import linalg
from ._base import LinearModel, _preprocess_data, _rescale_data
from ..base import RegressorMixin
from ._base import _deprecate_normalize
from ..utils.extmath import fast_logdet
from scipy.linalg import pinvh
from ..utils.validation import _check_sample_weight
###############################################################################
# BayesianRidge regression
class BayesianRidge(RegressorMixin, LinearModel):
"""Bayesian ridge regression.
Fit a Bayesian ridge model. See the Notes section for details on this
implementation and the optimization of the regularization parameters
lambda (precision of the weights) and alpha (precision of the noise).
Read more in the :ref:`User Guide <bayesian_regression>`.
Parameters
----------
n_iter : int, default=300
Maximum number of iterations. Should be greater than or equal to 1.
tol : float, default=1e-3
Stop the algorithm if w has converged.
alpha_1 : float, default=1e-6
Hyper-parameter : shape parameter for the Gamma distribution prior
over the alpha parameter.
alpha_2 : float, default=1e-6
Hyper-parameter : inverse scale parameter (rate parameter) for the
Gamma distribution prior over the alpha parameter.
lambda_1 : float, default=1e-6
Hyper-parameter : shape parameter for the Gamma distribution prior
over the lambda parameter.
lambda_2 : float, default=1e-6
Hyper-parameter : inverse scale parameter (rate parameter) for the
Gamma distribution prior over the lambda parameter.
alpha_init : float, default=None
Initial value for alpha (precision of the noise).
If not set, alpha_init is 1/Var(y).
.. versionadded:: 0.22
lambda_init : float, default=None
Initial value for lambda (precision of the weights).
If not set, lambda_init is 1.
.. versionadded:: 0.22
compute_score : bool, default=False
If True, compute the log marginal likelihood at each iteration of the
optimization.
fit_intercept : bool, default=True
Whether to calculate the intercept for this model.
The intercept is not treated as a probabilistic parameter
and thus has no associated variance. If set
to False, no intercept will be used in calculations
(i.e. data is expected to be centered).
normalize : bool, default=False
This parameter is ignored when ``fit_intercept`` is set to False.
If True, the regressors X will be normalized before regression by
subtracting the mean and dividing by the l2-norm.
If you wish to standardize, please use
:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
on an estimator with ``normalize=False``.
.. deprecated:: 1.0
``normalize`` was deprecated in version 1.0 and will be removed in
1.2.
copy_X : bool, default=True
If True, X will be copied; else, it may be overwritten.
verbose : bool, default=False
Verbose mode when fitting the model.
Attributes
----------
coef_ : array-like of shape (n_features,)
Coefficients of the regression model (mean of distribution)
intercept_ : float
Independent term in decision function. Set to 0.0 if
``fit_intercept = False``.
alpha_ : float
Estimated precision of the noise.
lambda_ : float
Estimated precision of the weights.
sigma_ : array-like of shape (n_features, n_features)
Estimated variance-covariance matrix of the weights
scores_ : array-like of shape (n_iter_+1,)
If compute_score is True, value of the log marginal likelihood (to be
maximized) at each iteration of the optimization. The array starts
with the value of the log marginal likelihood obtained for the initial
values of alpha and lambda and ends with the value obtained for the
estimated alpha and lambda.
n_iter_ : int
The actual number of iterations to reach the stopping criterion.
X_offset_ : float
If `normalize=True`, offset subtracted for centering data to a
zero mean.
X_scale_ : float
If `normalize=True`, parameter used to scale data to a unit
standard deviation.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
ARDRegression : Bayesian ARD regression.
Notes
-----
There exist several strategies to perform Bayesian ridge regression. This
implementation is based on the algorithm described in Appendix A of
(Tipping, 2001) where updates of the regularization parameters are done as
suggested in (MacKay, 1992). Note that according to A New
View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
update rules do not guarantee that the marginal likelihood is increasing
between two consecutive iterations of the optimization.
References
----------
D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
Vol. 4, No. 3, 1992.
M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
Journal of Machine Learning Research, Vol. 1, 2001.
Examples
--------
>>> from sklearn import linear_model
>>> clf = linear_model.BayesianRidge()
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
BayesianRidge()
>>> clf.predict([[1, 1]])
array([1.])
"""
def __init__(
    self,
    *,
    n_iter=300,
    tol=1.0e-3,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    alpha_init=None,
    lambda_init=None,
    compute_score=False,
    fit_intercept=True,
    normalize="deprecated",
    copy_X=True,
    verbose=False,
):
    """Record the hyper-parameters on the instance without validating them.

    All arguments are keyword-only and are stored under attributes of the
    same name; nothing else happens here.
    """
    # Stopping rule of the iterative procedure.
    self.n_iter, self.tol = n_iter, tol
    # Gamma hyper-priors over the noise precision (alpha) and the weight
    # precision (lambda).
    self.alpha_1, self.alpha_2 = alpha_1, alpha_2
    self.lambda_1, self.lambda_2 = lambda_1, lambda_2
    # Optional starting values for alpha and lambda.
    self.alpha_init, self.lambda_init = alpha_init, lambda_init
    # Remaining switches.
    self.compute_score, self.fit_intercept = compute_score, fit_intercept
    self.normalize, self.copy_X = normalize, copy_X
    self.verbose = verbose
def fit(self, X, y, sample_weight=None):
    """Fit the model.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data.
    y : ndarray of shape (n_samples,)
        Target values. Will be cast to X's dtype if necessary.
    sample_weight : ndarray of shape (n_samples,), default=None
        Individual weights for each sample.

        .. versionadded:: 0.20
           parameter *sample_weight* support to BayesianRidge.

    Returns
    -------
    self : object
        Returns the instance itself.

    Raises
    ------
    ValueError
        If `n_iter` is smaller than 1.
    """
    self._normalize = _deprecate_normalize(
        self.normalize, default=False, estimator_name=self.__class__.__name__
    )

    if self.n_iter < 1:
        raise ValueError(
            "n_iter should be greater than or equal to 1. Got {!r}.".format(
                self.n_iter
            )
        )

    X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
        X,
        y,
        self.fit_intercept,
        self._normalize,
        self.copy_X,
        sample_weight=sample_weight,
    )

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y, _ = _rescale_data(X, y, sample_weight)

    self.X_offset_ = X_offset_
    self.X_scale_ = X_scale_
    n_samples, n_features = X.shape

    # Initialization of the values of the parameters
    eps = np.finfo(np.float64).eps
    # Add `eps` in the denominator to omit division by zero if `np.var(y)`
    # is zero
    alpha_ = self.alpha_init
    lambda_ = self.lambda_init
    if alpha_ is None:
        alpha_ = 1.0 / (np.var(y) + eps)
    if lambda_ is None:
        lambda_ = 1.0

    verbose = self.verbose
    lambda_1 = self.lambda_1
    lambda_2 = self.lambda_2
    alpha_1 = self.alpha_1
    alpha_2 = self.alpha_2

    self.scores_ = list()
    coef_old_ = None

    # The SVD of X is computed once; every coefficient update below reuses it.
    XT_y = np.dot(X.T, y)
    U, S, Vh = linalg.svd(X, full_matrices=False)
    eigen_vals_ = S**2

    # Convergence loop of the bayesian ridge regression
    for iter_ in range(self.n_iter):

        # update posterior mean coef_ based on alpha_ and lambda_ and
        # compute corresponding rmse
        coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)

        # Update alpha and lambda according to (MacKay, 1992)
        gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
        lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
        alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

        # Check for convergence
        if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
            if verbose:
                print("Convergence after ", str(iter_), " iterations")
            break
        coef_old_ = np.copy(coef_)

    self.n_iter_ = iter_ + 1

    # return regularization parameters and corresponding posterior mean,
    # log marginal likelihood and posterior covariance
    self.alpha_ = alpha_
    self.lambda_ = lambda_
    self.coef_, rmse_ = self._update_coef_(
        X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    )
    if self.compute_score:
        # compute the log marginal likelihood. The freshly updated
        # `self.coef_` is used here (not the loop-local `coef_`, which was
        # computed with the previous alpha_/lambda_) so that the coefficient
        # vector matches the `rmse_` obtained from the same update.
        s = self._log_marginal_likelihood(
            n_samples, n_features, eigen_vals_, alpha_, lambda_, self.coef_, rmse_
        )
        self.scores_.append(s)
        self.scores_ = np.array(self.scores_)

    # posterior covariance is given by 1/alpha_ * scaled_sigma_
    scaled_sigma_ = np.dot(
        Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
    )
    self.sigma_ = (1.0 / alpha_) * scaled_sigma_

    self._set_intercept(X_offset_, y_offset_, X_scale_)

    return self
def predict(self, X, return_std=False):
    """Predict using the linear model.

    In addition to the mean of the predictive distribution, also its
    standard deviation can be returned.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.
    return_std : bool, default=False
        Whether to return the standard deviation of posterior prediction.
        Any falsy value (including ``numpy.False_``) disables it.

    Returns
    -------
    y_mean : array-like of shape (n_samples,)
        Mean of predictive distribution of query points.
    y_std : array-like of shape (n_samples,)
        Standard deviation of predictive distribution of query points.
        Only returned when `return_std` is truthy.
    """
    y_mean = self._decision_function(X)
    # Truthiness rather than `is False`: an identity test would send falsy
    # non-singletons such as `np.False_` or `0` down the std branch.
    if not return_std:
        return y_mean

    if self._normalize:
        # Apply the centering/scaling recorded during fit.
        X = (X - self.X_offset_) / self.X_scale_
    # Per-sample quadratic form x^T sigma_ x, plus the noise variance 1/alpha_.
    sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
    y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
    return y_mean, y_std
def _update_coef_(
    self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
):
    """Update posterior mean and compute corresponding rmse.

    Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
    scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                     + np.dot(X.T, X))^-1

    Returns the pair ``(coef_, rmse_)`` where ``rmse_`` is the sum of
    squared residuals ``y - X @ coef_``.
    """
    # Denominator shared by both formulations of the solution.
    shrinkage = eigen_vals_ + lambda_ / alpha_

    if n_samples > n_features:
        # More samples than features: expand in the right singular vectors.
        factors = [Vh.T, Vh / shrinkage[:, np.newaxis], XT_y]
    else:
        # Otherwise expand in the left singular vectors.
        factors = [X.T, U / shrinkage[None, :], U.T, y]
    coef_ = np.linalg.multi_dot(factors)

    residual = y - np.dot(X, coef_)
    rmse_ = np.sum(residual**2)

    return coef_, rmse_
def _log_marginal_likelihood(
    self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
):
    """Log marginal likelihood.

    Combines the Gamma hyper-prior terms on `lambda_` and `alpha_` with the
    evidence term built from `rmse`, `coef` and the log-determinant of the
    posterior covariance.
    """
    # Log-determinant of the posterior covariance, which is given by
    # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
    if n_samples > n_features:
        precision_spectrum = lambda_ + alpha_ * eigen_vals
    else:
        # Only the first n_samples entries receive an eigenvalue
        # contribution; the remaining ones stay at lambda_.
        precision_spectrum = np.full(
            n_features, lambda_, dtype=np.array(lambda_).dtype
        )
        precision_spectrum[:n_samples] += alpha_ * eigen_vals
    logdet_sigma = -np.sum(np.log(precision_spectrum))

    # Contribution of the Gamma hyper-priors.
    hyperprior_term = self.lambda_1 * log(lambda_) - self.lambda_2 * lambda_
    hyperprior_term += self.alpha_1 * log(alpha_) - self.alpha_2 * alpha_

    evidence_term = 0.5 * (
        n_features * log(lambda_)
        + n_samples * log(alpha_)
        - alpha_ * rmse
        - lambda_ * np.sum(coef**2)
        + logdet_sigma
        - n_samples * log(2 * np.pi)
    )

    return hyperprior_term + evidence_term
###############################################################################
# ARD (Automatic Relevance Determination) regression
class ARDRegression(RegressorMixin, LinearModel):
    """Bayesian ARD regression.

    Fit the weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to be in Gaussian distributions.
    Also estimate the parameters lambda (precisions of the distributions of the
    weights) and alpha (precision of the distribution of the noise).
    The estimation is done by an iterative procedures (Evidence Maximization)

    Read more in the :ref:`User Guide <bayesian_regression>`.

    Parameters
    ----------
    n_iter : int, default=300
        Maximum number of iterations.

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    compute_score : bool, default=False
        If True, compute the objective function at each step of the model.

    threshold_lambda : float, default=10 000
        Threshold for removing (pruning) weights with high precision from
        the computation.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

        .. deprecated:: 1.0
            ``normalize`` was deprecated in version 1.0 and will be removed in
            1.2.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    alpha_ : float
       estimated precision of the noise.

    lambda_ : array-like of shape (n_features,)
       estimated precisions of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        estimated variance-covariance matrix of the weights

    scores_ : list of float
        if ``compute_score=True``, value of the objective function (to be
        maximized) at each iteration; an empty list otherwise.

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    X_offset_ : float
        If `normalize=True`, offset subtracted for centering data to a
        zero mean.

    X_scale_ : float
        If `normalize=True`, parameter used to scale data to a unit
        standard deviation.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    BayesianRidge : Bayesian ridge regression.

    Notes
    -----
    For an example, see :ref:`examples/linear_model/plot_ard.py
    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.

    References
    ----------
    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
    competition, ASHRAE Transactions, 1994.

    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
    Their beta is our ``self.alpha_``
    Their alpha is our ``self.lambda_``
    ARD is a little different than the slide: only dimensions/features for
    which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
    discarded.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.ARDRegression()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    ARDRegression()
    >>> clf.predict([[1, 1]])
    array([1.])
    """

    def __init__(
        self,
        *,
        n_iter=300,
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        compute_score=False,
        threshold_lambda=1.0e4,
        fit_intercept=True,
        normalize="deprecated",
        copy_X=True,
        verbose=False,
    ):
        # Hyper-parameters are stored as given; nothing is validated here.
        self.n_iter = n_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.compute_score = compute_score
        self.threshold_lambda = threshold_lambda
        self.copy_X = copy_X
        self.verbose = verbose

    def fit(self, X, y):
        """Fit the model according to the given training data and parameters.

        Iterative procedure to maximize the evidence

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values (integers). Will be cast to X's dtype if necessary.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._normalize = _deprecate_normalize(
            self.normalize, default=False, estimator_name=self.__class__.__name__
        )

        X, y = self._validate_data(
            X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
        )

        n_samples, n_features = X.shape
        coef_ = np.zeros(n_features, dtype=X.dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X, y, self.fit_intercept, self._normalize, self.copy_X
        )

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_

        # Launch the convergence loop
        # Boolean mask of the features that are still part of the model.
        keep_lambda = np.ones(n_features, dtype=bool)

        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        verbose = self.verbose

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
        # is zero
        alpha_ = 1.0 / (np.var(y) + eps)
        lambda_ = np.ones(n_features, dtype=X.dtype)

        self.scores_ = list()
        coef_old_ = None

        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
            # Only the retained entries are recomputed; `coef_` is modified
            # in place and returned.
            coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
                [sigma_, X[:, keep_lambda].T, y]
            )
            return coef_

        # Choose the sigma update according to the shape of X (see the
        # comments in the two helper methods).
        update_sigma = (
            self._update_sigma
            if n_samples >= n_features
            else self._update_sigma_woodbury
        )
        # Iterative procedure of ARDRegression
        for iter_ in range(self.n_iter):
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

            # Update alpha and lambda
            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
            gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
            lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
                (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
            )
            alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
                rmse_ + 2.0 * alpha_2
            )

            # Prune the weights with a precision over a threshold
            keep_lambda = lambda_ < self.threshold_lambda
            coef_[~keep_lambda] = 0

            # Compute the objective function
            if self.compute_score:
                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                s += 0.5 * (
                    fast_logdet(sigma_)
                    + n_samples * log(alpha_)
                    + np.sum(np.log(lambda_))
                )
                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
                self.scores_.append(s)

            # Check for convergence
            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Converged after %s iterations" % iter_)
                break
            coef_old_ = np.copy(coef_)

            # Nothing left to estimate once every weight has been pruned.
            if not keep_lambda.any():
                break

        if keep_lambda.any():
            # update sigma and mu using updated params from the last iteration
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
        else:
            sigma_ = np.array([]).reshape(0, 0)

        self.coef_ = coef_
        self.alpha_ = alpha_
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self

    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples < n_features and will invert
        # a matrix of shape (n_samples, n_samples) making use of the
        # woodbury formula:
        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
        n_samples = X.shape[0]
        X_keep = X[:, keep_lambda]
        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
        sigma_ = pinvh(
            np.eye(n_samples, dtype=X.dtype) / alpha_
            + np.dot(X_keep * inv_lambda, X_keep.T)
        )
        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
        sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
        sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
        return sigma_

    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples >= n_features and will
        # invert a matrix of shape (n_features, n_features)
        X_keep = X[:, keep_lambda]
        gram = np.dot(X_keep.T, X_keep)
        eye = np.eye(gram.shape[0], dtype=X.dtype)
        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
        sigma_ = pinvh(sigma_inv)
        return sigma_

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, also its
        standard deviation can be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.
            Any falsy value (including ``numpy.False_``) disables it.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
            Only returned when `return_std` is truthy.
        """
        y_mean = self._decision_function(X)
        # Truthiness rather than `is False`, so that falsy non-singletons
        # such as `np.False_` or `0` do not trigger the std computation.
        if not return_std:
            return y_mean

        if not hasattr(X, "tocsr"):
            # Lists and other array-likes do not support the 2-D boolean
            # column selection used below; convert them to an ndarray first.
            # Objects exposing `tocsr` (scipy sparse matrices) are left as is.
            X = np.asarray(X)
        if self._normalize:
            X = (X - self.X_offset_) / self.X_scale_
        # `sigma_` only covers the features that survived pruning, so keep
        # the matching columns of X.
        X = X[:, self.lambda_ < self.threshold_lambda]
        sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
        y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
        return y_mean, y_std

View File

@@ -0,0 +1,15 @@
# License: BSD 3 clause
from .glm import (
_GeneralizedLinearRegressor,
PoissonRegressor,
GammaRegressor,
TweedieRegressor,
)
__all__ = [
"_GeneralizedLinearRegressor",
"PoissonRegressor",
"GammaRegressor",
"TweedieRegressor",
]

View File

@@ -0,0 +1,838 @@
"""
Generalized Linear Models with Exponential Dispersion Family
"""
# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
# some parts and tricks stolen from other sklearn files.
# License: BSD 3 clause
import numbers
import numpy as np
import scipy.optimize
from ..._loss.glm_distribution import TweedieDistribution
from ..._loss.loss import (
HalfGammaLoss,
HalfPoissonLoss,
HalfSquaredError,
HalfTweedieLoss,
HalfTweedieLossIdentity,
)
from ...base import BaseEstimator, RegressorMixin
from ...utils.optimize import _check_optimize_result
from ...utils import check_scalar, check_array, deprecated
from ...utils.validation import check_is_fitted, _check_sample_weight
from ...utils._openmp_helpers import _openmp_effective_n_threads
from .._linear_loss import LinearModelLoss
class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator):
    """Regression via a penalized Generalized Linear Model (GLM).

    GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and
    predicting the mean of the target y as y_pred=h(X*w) with coefficients w.
    Therefore, the fit minimizes the following objective function with L2 priors as
    regularizer::

        1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2

    with inverse link function h, s=sample_weight and per observation (unit) deviance
    deviance(y_i, h(x_i*w)). Note that for an EDM, 1/2 * deviance is the negative
    log-likelihood up to a constant (in w) term.
    The parameter ``alpha`` corresponds to the lambda parameter in glmnet.

    Instead of implementing the EDM family and a link function separately, we directly
    use the loss functions `from sklearn._loss` which have the link functions included
    in them for performance reasons. We pick the loss functions that implement
    (1/2 times) EDM deviances.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the penalty term and thus determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).
        Values must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (X @ coef + intercept).

    solver : 'lbfgs', default='lbfgs'
        Algorithm to use in the optimization problem:

        'lbfgs'
            Calls scipy's L-BFGS-B optimizer.

    max_iter : int, default=100
        The maximal number of iterations for the solver.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to ``fit``
        as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for verbosity.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.

    _base_loss : BaseLoss, default=HalfSquaredError()
        This is set during fit via `self._get_loss()`.
        A `_base_loss` contains a specific loss function as well as the link
        function. The loss to be minimized specifies the distributional assumption of
        the GLM, i.e. the distribution from the EDM. Here are some examples:

        ======================= ======== ==========================
        _base_loss              Link     Target Domain
        ======================= ======== ==========================
        HalfSquaredError        identity y any real number
        HalfPoissonLoss         log      0 <= y
        HalfGammaLoss           log      0 < y
        HalfTweedieLoss         log      dependent on tweedie power
        HalfTweedieLossIdentity identity dependent on tweedie power
        ======================= ======== ==========================

        The link function of the GLM, i.e. mapping from linear predictor
        `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link,
        we have `y_pred = exp(X @ coeff + intercept)`.
    """

    def __init__(
        self,
        *,
        alpha=1.0,
        fit_intercept=True,
        solver="lbfgs",
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Hyper-parameters are stored as given; validation happens in `fit`.
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.max_iter = max_iter
        self.tol = tol
        self.warm_start = warm_start
        self.verbose = verbose

    def fit(self, X, y, sample_weight=None):
        """Fit a Generalized Linear Model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Fitted model.

        Raises
        ------
        ValueError
            If `fit_intercept` or `warm_start` is not a bool, if `solver` is
            not supported, or if some value of `y` lies outside the valid
            range of the loss. Further errors may be raised by the
            `check_scalar` validation of `alpha`, `max_iter`, `tol` and
            `verbose`.
        """
        check_scalar(
            self.alpha,
            name="alpha",
            target_type=numbers.Real,
            min_val=0.0,
            include_boundaries="left",
        )
        if not isinstance(self.fit_intercept, bool):
            raise ValueError(
                "The argument fit_intercept must be bool; got {0}".format(
                    self.fit_intercept
                )
            )
        if self.solver not in ["lbfgs"]:
            raise ValueError(
                f"{self.__class__.__name__} supports only solvers 'lbfgs'; "
                f"got {self.solver}"
            )
        solver = self.solver
        check_scalar(
            self.max_iter,
            name="max_iter",
            target_type=numbers.Integral,
            min_val=1,
        )
        check_scalar(
            self.tol,
            name="tol",
            target_type=numbers.Real,
            min_val=0.0,
            include_boundaries="neither",
        )
        check_scalar(
            self.verbose,
            name="verbose",
            target_type=numbers.Integral,
            min_val=0,
        )
        if not isinstance(self.warm_start, bool):
            raise ValueError(
                "The argument warm_start must be bool; got {0}".format(self.warm_start)
            )

        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csc", "csr"],
            dtype=[np.float64, np.float32],
            y_numeric=True,
            multi_output=False,
        )

        # required by losses
        if solver == "lbfgs":
            # lbfgs will force coef and therefore raw_prediction to be float64. The
            # base_loss needs y, X @ coef and sample_weight all of same dtype
            # (and contiguous).
            loss_dtype = np.float64
        else:
            loss_dtype = min(max(y.dtype, X.dtype), np.float64)
        y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)

        # TODO: We could support samples_weight=None as the losses support it.
        # Note that _check_sample_weight calls check_array(order="C") required by
        # losses.
        sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)

        n_samples, n_features = X.shape
        self._base_loss = self._get_loss()

        linear_loss = LinearModelLoss(
            base_loss=self._base_loss,
            fit_intercept=self.fit_intercept,
        )

        if not linear_loss.base_loss.in_y_true_range(y):
            raise ValueError(
                "Some value(s) of y are out of the valid range of the loss"
                f" {self._base_loss.__class__.__name__!r}."
            )

        # TODO: if alpha=0 check that X is not rank deficient

        # IMPORTANT NOTE: Rescaling of sample_weight:
        # We want to minimize
        #     obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
        #         + 1/2 * alpha * L2,
        # with
        #     deviance = 2 * loss.
        # The objective is invariant to multiplying sample_weight by a constant. We
        # choose this constant such that sum(sample_weight) = 1. Thus, we end up with
        #     obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
        # Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
        sample_weight = sample_weight / sample_weight.sum()

        if self.warm_start and hasattr(self, "coef_"):
            if self.fit_intercept:
                # LinearModelLoss needs intercept at the end of coefficient array.
                coef = np.concatenate((self.coef_, np.array([self.intercept_])))
            else:
                coef = self.coef_
            coef = coef.astype(loss_dtype, copy=False)
        else:
            if self.fit_intercept:
                coef = np.zeros(n_features + 1, dtype=loss_dtype)
                # Start the intercept at the link of the weighted mean of y.
                coef[-1] = linear_loss.base_loss.link.link(
                    np.average(y, weights=sample_weight)
                )
            else:
                coef = np.zeros(n_features, dtype=loss_dtype)

        # Algorithms for optimization:
        # Note again that our losses implement 1/2 * deviance.
        if solver == "lbfgs":
            func = linear_loss.loss_gradient
            l2_reg_strength = self.alpha
            n_threads = _openmp_effective_n_threads()

            opt_res = scipy.optimize.minimize(
                func,
                coef,
                method="L-BFGS-B",
                jac=True,
                options={
                    "maxiter": self.max_iter,
                    "iprint": (self.verbose > 0) - 1,
                    "gtol": self.tol,
                    "ftol": 1e3 * np.finfo(float).eps,
                },
                args=(X, y, sample_weight, l2_reg_strength, n_threads),
            )
            self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
            coef = opt_res.x

        if self.fit_intercept:
            self.intercept_ = coef[-1]
            self.coef_ = coef[:-1]
        else:
            # set intercept to zero as the other linear models do
            self.intercept_ = 0.0
            self.coef_ = coef

        return self

    def _linear_predictor(self, X):
        """Compute the linear_predictor = `X @ coef_ + intercept_`.

        Note that we often use the term raw_prediction instead of linear predictor.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            Returns predicted values of linear predictor.
        """
        check_is_fitted(self)
        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=[np.float64, np.float32],
            ensure_2d=True,
            allow_nd=False,
            reset=False,
        )
        return X @ self.coef_ + self.intercept_

    def predict(self, X):
        """Predict using GLM with feature matrix X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            Returns predicted values.
        """
        # check_array is done in _linear_predictor
        raw_prediction = self._linear_predictor(X)
        y_pred = self._base_loss.link.inverse(raw_prediction)
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Compute D^2, the percentage of deviance explained.

        D^2 is a generalization of the coefficient of determination R^2.
        R^2 uses squared error and D^2 uses the deviance of this GLM, see the
        :ref:`User Guide <regression_metrics>`.

        D^2 is defined as
        :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,
        :math:`D_{null}` is the null deviance, i.e. the deviance of a model
        with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.
        The mean :math:`\\bar{y}` is averaged by sample_weight.
        Best possible score is 1.0 and it can be negative (because the model
        can be arbitrarily worse).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,)
            True values of target.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            D^2 of self.predict(X) w.r.t. y.

        Raises
        ------
        ValueError
            If some value of `y` lies outside the valid range of the loss.
        """
        # TODO: Adapt link to User Guide in the docstring, once
        # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged.
        #
        # Note, default score defined in RegressorMixin is R^2 score.
        # TODO: make D^2 a score function in module metrics (and thereby get
        #       input validation and so on)
        raw_prediction = self._linear_predictor(X)  # validates X
        # required by losses
        y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False)

        if sample_weight is not None:
            # Note that _check_sample_weight calls check_array(order="C") required by
            # losses.
            sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype)

        base_loss = self._base_loss

        if not base_loss.in_y_true_range(y):
            # `base_loss` is an instance, so the name has to come from its
            # class; reading `__name__` on the instance would raise
            # AttributeError instead of the intended ValueError.
            raise ValueError(
                "Some value(s) of y are out of the valid range of the loss"
                f" {base_loss.__class__.__name__!r}."
            )

        # Note that constant_to_optimal_zero is already multiplied by sample_weight.
        constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y))
        if sample_weight is not None:
            constant *= sample_weight.shape[0] / np.sum(sample_weight)

        # Missing factor of 2 in deviance cancels out.
        deviance = base_loss(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=1,
        )
        # Null model: a constant prediction at the (weighted) mean of y.
        y_mean = base_loss.link.link(np.average(y, weights=sample_weight))
        deviance_null = base_loss(
            y_true=y,
            raw_prediction=np.tile(y_mean, y.shape[0]),
            sample_weight=sample_weight,
            n_threads=1,
        )
        return 1 - (deviance + constant) / (deviance_null + constant)

    def _more_tags(self):
        # Create instance of BaseLoss if fit wasn't called yet. This is necessary as
        # TweedieRegressor might set the used loss during fit different from
        # self._base_loss.
        base_loss = self._get_loss()
        return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)}

    def _get_loss(self):
        """This is only necessary because of the link and power arguments of the
        TweedieRegressor.

        Note that we do not need to pass sample_weight to the loss class as this is
        only needed to set loss.constant_hessian on which GLMs do not rely.
        """
        return HalfSquaredError()

    # TODO(1.3): remove
    @deprecated(  # type: ignore
        "Attribute `family` was deprecated in version 1.1 and will be removed in 1.3."
    )
    @property
    def family(self):
        """Ensure backward compatibility for the time of deprecation."""
        if isinstance(self, PoissonRegressor):
            return "poisson"
        elif isinstance(self, GammaRegressor):
            return "gamma"
        elif isinstance(self, TweedieRegressor):
            return TweedieDistribution(power=self.power)
        else:
            raise ValueError(  # noqa
                "This should never happen. You presumably accessed the deprecated "
                "`family` attribute from a subclass of the private scikit-learn class "
                "_GeneralizedLinearRegressor."
            )
class PoissonRegressor(_GeneralizedLinearRegressor):
"""Generalized Linear Model with a Poisson distribution.
This regressor uses the 'log' link function.
Read more in the :ref:`User Guide <Generalized_linear_regression>`.
.. versionadded:: 0.23
Parameters
----------
alpha : float, default=1
Constant that multiplies the penalty term and thus determines the
regularization strength. ``alpha = 0`` is equivalent to unpenalized
GLMs. In this case, the design matrix `X` must have full column rank
(no collinearities).
Values must be in the range `[0.0, inf)`.
fit_intercept : bool, default=True
Specifies if a constant (a.k.a. bias or intercept) should be
added to the linear predictor (X @ coef + intercept).
max_iter : int, default=100
The maximal number of iterations for the solver.
Values must be in the range `[1, inf)`.
tol : float, default=1e-4
Stopping criterion. For the lbfgs solver,
the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
where ``g_j`` is the j-th component of the gradient (derivative) of
the objective function.
Values must be in the range `(0.0, inf)`.
warm_start : bool, default=False
If set to ``True``, reuse the solution of the previous call to ``fit``
as initialization for ``coef_`` and ``intercept_`` .
verbose : int, default=0
For the lbfgs solver set verbose to any positive number for verbosity.
Values must be in the range `[0, inf)`.
Attributes
----------
coef_ : array of shape (n_features,)
Estimated coefficients for the linear predictor (`X @ coef_ +
intercept_`) in the GLM.
intercept_ : float
Intercept (a.k.a. bias) added to linear predictor.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
Actual number of iterations used in the solver.
See Also
--------
TweedieRegressor : Generalized Linear Model with a Tweedie distribution.
Examples
--------
>>> from sklearn import linear_model
>>> clf = linear_model.PoissonRegressor()
>>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
>>> y = [12, 17, 22, 21]
>>> clf.fit(X, y)
PoissonRegressor()
>>> clf.score(X, y)
0.990...
>>> clf.coef_
array([0.121..., 0.158...])
>>> clf.intercept_
2.088...
>>> clf.predict([[1, 1], [3, 4]])
array([10.676..., 21.875...])
"""
def __init__(
    self,
    *,
    alpha=1.0,
    fit_intercept=True,
    max_iter=100,
    tol=1e-4,
    warm_start=False,
    verbose=0,
):
    """Forward every hyper-parameter unchanged to the parent constructor."""
    # All arguments are keyword-only and are passed through one-to-one;
    # this estimator defines no hyper-parameter of its own here.
    super().__init__(
        verbose=verbose,
        warm_start=warm_start,
        tol=tol,
        max_iter=max_iter,
        fit_intercept=fit_intercept,
        alpha=alpha,
    )
def _get_loss(self):
    """Return a new ``HalfPoissonLoss`` instance."""
    poisson_loss = HalfPoissonLoss()
    return poisson_loss
class GammaRegressor(_GeneralizedLinearRegressor):
    """Generalized Linear Model with a Gamma distribution.

    This regressor uses the 'log' link function.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    alpha : float, default=1
        Multiplier of the penalty term, i.e. the regularization strength.
        With ``alpha = 0`` the model is an unpenalized GLM, in which case
        the design matrix `X` must have full column rank (no
        collinearities).
        Values must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Whether a constant (a.k.a. bias or intercept) is added to the
        linear predictor (X @ coef + intercept).

    max_iter : int, default=100
        Upper limit on the number of solver iterations.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver, iterations stop as soon
        as ``max{|g_j|, j = 1, ..., d} <= tol`` where ``g_j`` is the j-th
        component of the gradient (derivative) of the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        When ``True``, the solution of the previous call to ``fit`` is
        reused as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        Any positive number turns on verbosity for the lbfgs solver.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    n_iter_ : int
        Actual number of iterations used in the solver.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
    TweedieRegressor : Generalized Linear Model with a Tweedie distribution.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.GammaRegressor()
    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
    >>> y = [19, 26, 33, 30]
    >>> clf.fit(X, y)
    GammaRegressor()
    >>> clf.score(X, y)
    0.773...
    >>> clf.coef_
    array([0.072..., 0.066...])
    >>> clf.intercept_
    2.896...
    >>> clf.predict([[1, 0], [2, 8]])
    array([19.483..., 35.795...])
    """

    def __init__(
        self,
        *,
        alpha=1.0,
        fit_intercept=True,
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Every hyper-parameter is handed over as-is to the parent class.
        super().__init__(
            verbose=verbose,
            warm_start=warm_start,
            tol=tol,
            max_iter=max_iter,
            fit_intercept=fit_intercept,
            alpha=alpha,
        )

    def _get_loss(self):
        """Return a new ``HalfGammaLoss`` instance."""
        gamma_loss = HalfGammaLoss()
        return gamma_loss
class TweedieRegressor(_GeneralizedLinearRegressor):
    """Generalized Linear Model with a Tweedie distribution.

    Depending on the ``power`` parameter, which selects the underlying
    distribution, this estimator can model different GLMs.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    power : float, default=0
        Selects the underlying target distribution according to the
        following table:

        +-------+------------------------+
        | Power | Distribution           |
        +=======+========================+
        | 0     | Normal                 |
        +-------+------------------------+
        | 1     | Poisson                |
        +-------+------------------------+
        | (1,2) | Compound Poisson Gamma |
        +-------+------------------------+
        | 2     | Gamma                  |
        +-------+------------------------+
        | 3     | Inverse Gaussian       |
        +-------+------------------------+

        For ``0 < power < 1``, no distribution exists.

    alpha : float, default=1
        Multiplier of the penalty term, i.e. the regularization strength.
        With ``alpha = 0`` the model is an unpenalized GLM, in which case
        the design matrix `X` must have full column rank (no
        collinearities).
        Values must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Whether a constant (a.k.a. bias or intercept) is added to the
        linear predictor (X @ coef + intercept).

    link : {'auto', 'identity', 'log'}, default='auto'
        The link function of the GLM, i.e. mapping from linear predictor
        `X @ coef + intercept` to prediction `y_pred`. Option 'auto' picks
        the link from the chosen `power` parameter as follows:

        - 'identity' for ``power <= 0``, e.g. for the Normal distribution
        - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian
          distributions

    max_iter : int, default=100
        Upper limit on the number of solver iterations.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver, iterations stop as soon
        as ``max{|g_j|, j = 1, ..., d} <= tol`` where ``g_j`` is the j-th
        component of the gradient (derivative) of the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        When ``True``, the solution of the previous call to ``fit`` is
        reused as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        Any positive number turns on verbosity for the lbfgs solver.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
    GammaRegressor : Generalized Linear Model with a Gamma distribution.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.TweedieRegressor()
    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
    >>> y = [2, 3.5, 5, 5.5]
    >>> clf.fit(X, y)
    TweedieRegressor()
    >>> clf.score(X, y)
    0.839...
    >>> clf.coef_
    array([0.599..., 0.299...])
    >>> clf.intercept_
    1.600...
    >>> clf.predict([[1, 1], [3, 4]])
    array([2.500..., 4.599...])
    """

    def __init__(
        self,
        *,
        power=0.0,
        alpha=1.0,
        fit_intercept=True,
        link="auto",
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Shared hyper-parameters are stored by the parent class; the two
        # Tweedie-specific ones are stored on the instance afterwards.
        super().__init__(
            verbose=verbose,
            warm_start=warm_start,
            tol=tol,
            max_iter=max_iter,
            fit_intercept=fit_intercept,
            alpha=alpha,
        )
        self.link = link
        self.power = power

    def _get_loss(self):
        """Return the Tweedie loss object matching ``link`` and ``power``.

        Raises
        ------
        ValueError
            If ``link`` is none of 'auto', 'identity' or 'log'.
        """
        requested = self.link
        if requested == "auto":
            # 'auto': identity link for power <= 0, log link otherwise.
            effective = "identity" if self.power <= 0 else "log"
        else:
            effective = requested

        if effective == "log":
            return HalfTweedieLoss(power=self.power)
        if effective == "identity":
            return HalfTweedieLossIdentity(power=self.power)
        raise ValueError(
            "The link must be an element of ['auto', 'identity', 'log']; "
            f"got (link={self.link!r})"
        )

View File

@@ -0,0 +1 @@
# License: BSD 3 clause

View File

@@ -0,0 +1,476 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import re
import numpy as np
from numpy.testing import assert_allclose
import pytest
import warnings
from sklearn.base import clone
from sklearn._loss.glm_distribution import TweedieDistribution
from sklearn._loss.link import IdentityLink, LogLink
from sklearn.datasets import make_regression
from sklearn.linear_model._glm import _GeneralizedLinearRegressor
from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor
from sklearn.linear_model import Ridge
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import d2_tweedie_score
from sklearn.model_selection import train_test_split
@pytest.fixture(scope="module")
def regression_data():
    """Module-scoped fixture returning an ``(X, y)`` pair from make_regression."""
    features, target = make_regression(
        n_samples=107,
        n_features=10,
        n_informative=80,
        noise=0.5,
        random_state=2,
    )
    return features, target
def test_sample_weights_validation():
    """Test the raised errors in the validation of sample_weight."""
    X = [[1]]
    y = [1]
    glm = _GeneralizedLinearRegressor()

    # A positive scalar weight is accepted without error.
    glm.fit(X, y, sample_weight=1)

    # A 2d array is rejected.
    weights = [[0]]
    with pytest.raises(ValueError, match="must be 1D array or scalar"):
        glm.fit(X, y, weights)

    # A 1d array whose length differs from the number of samples is rejected.
    weights = [1, 0]
    msg = r"sample_weight.shape == \(2,\), expected \(1,\)!"
    with pytest.raises(ValueError, match=msg):
        glm.fit(X, y, weights)
@pytest.mark.parametrize("fit_intercept", ["not bool", 1, 0, [True]])
def test_glm_fit_intercept_argument(fit_intercept):
    """A non-bool ``fit_intercept`` must make ``fit`` raise a ValueError."""
    target = np.array([1, 2])
    features = np.array([[1], [1]])
    estimator = _GeneralizedLinearRegressor(fit_intercept=fit_intercept)
    with pytest.raises(ValueError, match="fit_intercept must be bool"):
        estimator.fit(features, target)
@pytest.mark.parametrize("solver", ["not a solver", 1, [1]])
def test_glm_solver_argument(solver):
    """An invalid ``solver`` must make ``fit`` raise a ValueError."""
    target = np.array([1, 2])
    features = np.array([[1], [2]])
    estimator = _GeneralizedLinearRegressor(solver=solver)
    with pytest.raises(ValueError):
        estimator.fit(features, target)
@pytest.mark.parametrize(
    "Estimator",
    [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor, TweedieRegressor],
)
@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        # max_iter
        ({"max_iter": 0}, ValueError, "max_iter == 0, must be >= 1"),
        ({"max_iter": -1}, ValueError, "max_iter == -1, must be >= 1"),
        (
            {"max_iter": "not a number"},
            TypeError,
            "max_iter must be an instance of int, not str",
        ),
        ({"max_iter": [1]}, TypeError, "max_iter must be an instance of int, not list"),
        (
            {"max_iter": 5.5},
            TypeError,
            "max_iter must be an instance of int, not float",
        ),
        # alpha
        ({"alpha": -1}, ValueError, "alpha == -1, must be >= 0.0"),
        ({"alpha": "1"}, TypeError, "alpha must be an instance of float, not str"),
        # tol
        ({"tol": -1.0}, ValueError, "tol == -1.0, must be > 0."),
        ({"tol": 0.0}, ValueError, "tol == 0.0, must be > 0.0"),
        ({"tol": 0}, ValueError, "tol == 0, must be > 0.0"),
        ({"tol": "1"}, TypeError, "tol must be an instance of float, not str"),
        ({"tol": [1e-3]}, TypeError, "tol must be an instance of float, not list"),
        # verbose
        ({"verbose": -1}, ValueError, "verbose == -1, must be >= 0."),
        ({"verbose": "1"}, TypeError, "verbose must be an instance of int, not str"),
        (
            {"verbose": 1.0},
            TypeError,
            "verbose must be an instance of int, not float",
        ),
    ],
)
def test_glm_scalar_argument(Estimator, params, err_type, err_msg):
    """Invalid scalar hyper-parameters must be rejected when calling ``fit``."""
    target = np.array([1, 2])
    features = np.array([[1], [2]])
    estimator = Estimator(**params)
    # The error is expected from ``fit``; construction above sits outside
    # the ``raises`` context.
    with pytest.raises(err_type, match=err_msg):
        estimator.fit(features, target)
@pytest.mark.parametrize("warm_start", ["not bool", 1, 0, [True]])
def test_glm_warm_start_argument(warm_start):
    """A non-bool ``warm_start`` must make ``fit`` raise a ValueError."""
    target = np.array([1, 2])
    features = np.array([[1], [1]])
    estimator = _GeneralizedLinearRegressor(warm_start=warm_start)
    with pytest.raises(ValueError, match="warm_start must be bool"):
        estimator.fit(features, target)
@pytest.mark.parametrize(
    "glm",
    [
        TweedieRegressor(power=3),
        PoissonRegressor(),
        GammaRegressor(),
        TweedieRegressor(power=1.5),
    ],
)
def test_glm_wrong_y_range(glm):
    """Fitting on a target containing a negative value must raise."""
    features = np.array([[1], [1]])
    target = np.array([-1, 2])
    expected = r"Some value\(s\) of y are out of the valid range of the loss"
    with pytest.raises(ValueError, match=expected):
        glm.fit(features, target)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_glm_identity_regression(fit_intercept):
    """Test GLM regression with identity link on a simple dataset."""
    true_coef = [1.0, 2.0]
    # First column is constant ones, so it plays the role of the intercept.
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.dot(X, true_coef)
    model = _GeneralizedLinearRegressor(
        alpha=0,
        fit_intercept=fit_intercept,
        tol=1e-12,
    )
    if not fit_intercept:
        model.fit(X, y)
        assert_allclose(model.coef_, true_coef, rtol=1e-12)
    else:
        # Drop the constant column and let the estimator fit the intercept.
        model.fit(X[:, 1:], y)
        assert_allclose(model.coef_, true_coef[1:], rtol=1e-10)
        assert_allclose(model.intercept_, true_coef[0], rtol=1e-10)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("alpha", [0.0, 1.0])
@pytest.mark.parametrize(
    "GLMEstimator", [_GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor]
)
def test_glm_sample_weight_consistency(fit_intercept, alpha, GLMEstimator):
    """Test that the impact of sample_weight is consistent"""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 5
    half = n_samples // 2

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    params = {"alpha": alpha, "fit_intercept": fit_intercept}

    model = GLMEstimator(**params).fit(X, y)
    reference_coef = model.coef_.copy()

    # Unit weights must give the same result as sample_weight=None.
    unit_weights = np.ones(y.shape)
    model.fit(X, y, sample_weight=unit_weights)
    assert_allclose(model.coef_, reference_coef, rtol=1e-12)

    # sample_weight are normalized to 1 so, scaling them has no effect.
    doubled_weights = 2 * np.ones(y.shape)
    model.fit(X, y, sample_weight=doubled_weights)
    assert_allclose(model.coef_, reference_coef, rtol=1e-12)

    # A zero weight on one sample must be equivalent to dropping that sample.
    masked_weights = np.ones(y.shape)
    masked_weights[-1] = 0
    model.fit(X, y, sample_weight=masked_weights)
    coef_with_zero_weight = model.coef_.copy()
    model.fit(X[:-1], y[:-1])
    assert_allclose(model.coef_, coef_with_zero_weight, rtol=1e-12)

    # A weight of 2 on a sample must be equivalent to repeating that sample.
    X_repeated = np.concatenate([X, X[:half]], axis=0)
    y_repeated = np.concatenate([y, y[:half]])
    repeat_weights = np.ones(len(y))
    repeat_weights[:half] = 2
    weighted = GLMEstimator(**params).fit(X, y, sample_weight=repeat_weights)
    repeated = GLMEstimator(**params).fit(X_repeated, y_repeated, sample_weight=None)
    assert_allclose(weighted.coef_, repeated.coef_)
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize(
    "estimator",
    [
        PoissonRegressor(),
        GammaRegressor(),
        TweedieRegressor(power=3.0),
        TweedieRegressor(power=0, link="log"),
        TweedieRegressor(power=1.5),
        TweedieRegressor(power=4.5),
    ],
)
def test_glm_log_regression(fit_intercept, estimator):
    """Test GLM regression with log link on a simple dataset."""
    true_coef = [0.2, -0.1]
    # Last column is constant ones, so it plays the role of the intercept.
    X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T
    y = np.exp(np.dot(X, true_coef))
    # Work on a clone so the shared parametrized instance is not modified.
    model = clone(estimator).set_params(
        alpha=0,
        fit_intercept=fit_intercept,
        tol=1e-8,
    )
    if not fit_intercept:
        fitted = model.fit(X, y)
        assert_allclose(fitted.coef_, true_coef, rtol=2e-6)
    else:
        fitted = model.fit(X[:, :-1], y)
        assert_allclose(fitted.coef_, true_coef[:-1], rtol=1e-6)
        assert_allclose(fitted.intercept_, true_coef[-1], rtol=1e-6)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_warm_start(fit_intercept):
    """A warm-started model, refitted with more iterations, matches a cold fit."""
    n_samples, n_features = 110, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        noise=0.5,
        random_state=42,
    )

    cold = _GeneralizedLinearRegressor(
        warm_start=False, fit_intercept=fit_intercept, max_iter=1000
    )
    cold.fit(X, y)

    warm = _GeneralizedLinearRegressor(
        warm_start=True, fit_intercept=fit_intercept, max_iter=1
    )
    # As we intentionally set max_iter=1, L-BFGS-B will issue a
    # ConvergenceWarning which we here simply ignore.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warm.fit(X, y)
    # After a single iteration the warm model must score worse.
    assert cold.score(X, y) > warm.score(X, y)

    warm.set_params(max_iter=1000)
    warm.fit(X, y)
    # The two model are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(cold.coef_, warm.coef_, rtol=1e-5)
    assert_allclose(cold.score(X, y), warm.score(X, y), rtol=1e-4)
# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
@pytest.mark.parametrize("n_samples, n_features", [(100, 10), (10, 100)])
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("sample_weight", [None, True])
def test_normal_ridge_comparison(
    n_samples, n_features, fit_intercept, sample_weight, request
):
    """Compare with Ridge regression for Normal distributions."""
    test_size = 10
    X, y = make_regression(
        n_samples=n_samples + test_size,
        n_features=n_features,
        n_informative=n_features - 2,
        noise=0.5,
        random_state=42,
    )

    # Ridge solver: "svd" when there are more samples than features,
    # otherwise "saga" with a large max_iter and a small tol.
    ridge_params = (
        {"solver": "svd"}
        if n_samples > n_features
        else {"solver": "saga", "max_iter": 1000000, "tol": 1e-7}
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    alpha = 1.0
    # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2, hence the Ridge
    # penalty is rescaled by the number of samples (or by the weight sum).
    if sample_weight is not None:
        sw_train = np.random.RandomState(0).rand(len(y_train))
        alpha_ridge = alpha * sw_train.sum()
    else:
        sw_train = None
        alpha_ridge = alpha * n_samples

    ridge = Ridge(
        alpha=alpha_ridge,
        normalize=False,
        random_state=42,
        fit_intercept=fit_intercept,
        **ridge_params,
    )
    ridge.fit(X_train, y_train, sample_weight=sw_train)

    glm = _GeneralizedLinearRegressor(
        alpha=alpha,
        fit_intercept=fit_intercept,
        max_iter=300,
        tol=1e-5,
    )
    glm.fit(X_train, y_train, sample_weight=sw_train)

    assert glm.coef_.shape == (X.shape[1],)
    assert_allclose(glm.coef_, ridge.coef_, atol=5e-5)
    assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5)
    for data in (X_train, X_test):
        assert_allclose(glm.predict(data), ridge.predict(data), rtol=2e-4)
def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink to glmnet"""
    # Reference values were produced with the following R session:
    #
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    expected_intercept = -0.12889386979
    expected_coef = [0.29019207995, 0.03741173122]

    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    model = PoissonRegressor(
        alpha=1,
        fit_intercept=True,
        tol=1e-7,
        max_iter=300,
    )
    model.fit(X, y)
    assert_allclose(model.intercept_, expected_intercept, rtol=1e-5)
    assert_allclose(model.coef_, expected_coef, rtol=1e-5)
def test_convergence_warning(regression_data):
    """One iteration with a tiny tolerance must emit a ConvergenceWarning."""
    features, target = regression_data
    estimator = _GeneralizedLinearRegressor(max_iter=1, tol=1e-20)
    with pytest.warns(ConvergenceWarning):
        estimator.fit(features, target)
@pytest.mark.parametrize(
    "name, link_class", [("identity", IdentityLink), ("log", LogLink)]
)
def test_tweedie_link_argument(name, link_class):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])

    fitted = TweedieRegressor(power=1, link=name).fit(X, y)
    assert isinstance(fitted._base_loss.link, link_class)

    # An unknown link name is only rejected once ``fit`` is called.
    invalid = TweedieRegressor(power=1, link="not a link")
    expected = re.escape("The link must be an element of ['auto', 'identity', 'log']")
    with pytest.raises(ValueError, match=expected):
        invalid.fit(X, y)
@pytest.mark.parametrize(
    "power, expected_link_class",
    [
        (0, IdentityLink),  # normal
        (1, LogLink),  # poisson
        (2, LogLink),  # gamma
        (3, LogLink),  # inverse-gaussian
    ],
)
def test_tweedie_link_auto(power, expected_link_class):
    """Test that link='auto' delivers the expected link function"""
    X = np.array([[1], [2]])
    y = np.array([0.1, 0.5])  # in range of all distributions
    fitted = TweedieRegressor(link="auto", power=power).fit(X, y)
    chosen_link = fitted._base_loss.link
    assert isinstance(chosen_link, expected_link_class)
@pytest.mark.parametrize("power", [0, 1, 1.5, 2, 3])
@pytest.mark.parametrize("link", ["log", "identity"])
def test_tweedie_score(regression_data, power, link):
    """Test that GLM score equals d2_tweedie_score for Tweedie losses."""
    X, raw_target = regression_data
    # Shift the target so that it is strictly positive.
    y = np.abs(raw_target) + 1.0
    model = TweedieRegressor(power=power, link=link).fit(X, y)
    expected_score = d2_tweedie_score(y, model.predict(X), power=power)
    assert model.score(X, y) == pytest.approx(expected_score)
@pytest.mark.parametrize(
    "estimator, value",
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    """Check the ``requires_positive_y`` estimator tag against its expected value."""
    tags = estimator._get_tags()
    assert tags["requires_positive_y"] is value
# TODO(1.3): remove
@pytest.mark.parametrize(
    "est, family",
    [
        (PoissonRegressor(), "poisson"),
        (GammaRegressor(), "gamma"),
        (TweedieRegressor(), TweedieDistribution()),
        (TweedieRegressor(power=2), TweedieDistribution(power=2)),
        (TweedieRegressor(power=3), TweedieDistribution(power=3)),
    ],
)
def test_family_deprecation(est, family):
    """Test backward compatibility of the family property."""
    with pytest.warns(FutureWarning, match="`family` was deprecated"):
        if not isinstance(family, str):
            # Distribution objects are compared by class and by power.
            assert est.family.__class__ == family.__class__
            assert est.family.power == family.power
        else:
            assert est.family == family

View File

@@ -0,0 +1,342 @@
# Authors: Manoj Kumar mks542@nyu.edu
# License: BSD 3 clause
import numpy as np
from scipy import optimize
from ..base import BaseEstimator, RegressorMixin
from ._base import LinearModel
from ..utils import axis0_safe_slice
from ..utils.validation import _check_sample_weight
from ..utils.extmath import safe_sparse_dot
from ..utils.optimize import _check_optimize_result
def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):
    """Returns the Huber loss and the gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features + 1,) or (n_features + 2,)
        Feature vector.
        w[:n_features] gives the coefficients
        w[-1] gives the scale factor and if the intercept is fit w[-2]
        gives the intercept factor.
        Whether an intercept is fit is inferred from the length of `w`:
        it is assumed to be present iff ``len(w) == n_features + 2``.

    X : ndarray of shape (n_samples, n_features)
        Input data.

    y : ndarray of shape (n_samples,)
        Target vector.

    epsilon : float
        Robustness of the Huber estimator.

    alpha : float
        Regularization parameter.

    sample_weight : ndarray of shape (n_samples,), default=None
        Weight assigned to each sample. If None, every sample gets a
        weight of one.

    Returns
    -------
    loss : float
        Huber loss.

    gradient : ndarray, shape (len(w))
        Returns the derivative of the Huber loss with respect to each
        coefficient, intercept and the scale as a vector.
    """
    _, n_features = X.shape
    if sample_weight is None:
        # The documented default used to crash on the boolean-mask indexing
        # below; treat a missing weight vector as unit weights instead.
        sample_weight = np.ones(X.shape[0])

    fit_intercept = n_features + 2 == w.shape[0]
    if fit_intercept:
        intercept = w[-2]
    sigma = w[-1]
    w = w[:n_features]
    # "Number of samples" here is the total sample weight.
    n_samples = np.sum(sample_weight)

    # Calculate the values where |(y - X'w - c) / sigma| > epsilon
    # The values above this threshold are outliers.
    linear_loss = y - safe_sparse_dot(X, w)
    if fit_intercept:
        linear_loss -= intercept
    abs_linear_loss = np.abs(linear_loss)
    outliers_mask = abs_linear_loss > epsilon * sigma

    # Calculate the linear loss due to the outliers.
    # This is equal to
    # (2 * epsilon * |(y - X'w - c) / sigma| - epsilon**2) * sigma
    outliers = abs_linear_loss[outliers_mask]
    num_outliers = np.count_nonzero(outliers_mask)
    n_non_outliers = X.shape[0] - num_outliers

    # n_sw_outliers includes the weight given to the outliers while
    # num_outliers is just the number of outliers.
    outliers_sw = sample_weight[outliers_mask]
    n_sw_outliers = np.sum(outliers_sw)
    outlier_loss = (
        2.0 * epsilon * np.sum(outliers_sw * outliers)
        - sigma * n_sw_outliers * epsilon**2
    )

    # Calculate the quadratic loss due to the non-outliers.
    # This is equal to |(y - X'w - c)**2 / sigma**2| * sigma
    non_outliers = linear_loss[~outliers_mask]
    weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers
    weighted_loss = np.dot(weighted_non_outliers.T, non_outliers)
    squared_loss = weighted_loss / sigma

    # Layout of the gradient mirrors the layout of the parameter vector:
    # coefficients, then (optionally) the intercept, then the scale.
    if fit_intercept:
        grad = np.zeros(n_features + 2)
    else:
        grad = np.zeros(n_features + 1)

    # Gradient due to the squared loss.
    X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers)
    grad[:n_features] = (
        2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers)
    )

    # Gradient due to the linear loss.
    signed_outliers = np.ones_like(outliers)
    signed_outliers_mask = linear_loss[outliers_mask] < 0
    signed_outliers[signed_outliers_mask] = -1.0
    X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers)
    sw_outliers = outliers_sw * signed_outliers
    grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers))

    # Gradient due to the penalty.
    grad[:n_features] += alpha * 2.0 * w

    # Gradient due to sigma.
    grad[-1] = n_samples
    grad[-1] -= n_sw_outliers * epsilon**2
    grad[-1] -= squared_loss / sigma

    # Gradient due to the intercept.
    if fit_intercept:
        grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma
        grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers)

    loss = n_samples * sigma + squared_loss + outlier_loss
    loss += alpha * np.dot(w, w)
    return loss, grad
class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
    """Linear regression model that is robust to outliers.

    The Huber Regressor optimizes the squared loss for the samples where
    ``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples
    where ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters
    to be optimized. The parameter sigma makes sure that if y is scaled up
    or down by a certain factor, one does not need to rescale epsilon to
    achieve the same robustness. Note that this does not take into account
    the fact that the different features of X may be of different scales.

    This makes sure that the loss function is not heavily influenced by the
    outliers while not completely ignoring their effect.

    Read more in the :ref:`User Guide <huber_regression>`

    .. versionadded:: 0.18

    Parameters
    ----------
    epsilon : float, greater than or equal to 1.0, default=1.35
        The parameter epsilon controls the number of samples that should be
        classified as outliers. The smaller the epsilon, the more robust it is
        to outliers.

    max_iter : int, default=100
        Maximum number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for.

    alpha : float, default=0.0001
        Regularization parameter.

    warm_start : bool, default=False
        This is useful if the stored attributes of a previously used model
        has to be reused. If set to False, then the coefficients will
        be rewritten for every call to fit.
        See :term:`the Glossary <warm_start>`.

    fit_intercept : bool, default=True
        Whether or not to fit the intercept. This can be set to False
        if the data is already centered around the origin.

    tol : float, default=1e-05
        The iteration will stop when
        ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``
        where pg_i is the i-th component of the projected gradient.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        Features got by optimizing the Huber loss.

    intercept_ : float
        Bias.

    scale_ : float
        The value by which ``|y - X'w - c|`` is scaled down.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for.

        .. versionchanged:: 0.20

            In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
            ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.

    outliers_ : array, shape (n_samples,)
        A boolean mask which is set to True where the samples are identified
        as outliers.

    See Also
    --------
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics
           Concomitant scale estimates, pg 172
    .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.
           https://statweb.stanford.edu/~owen/reports/hhu.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import HuberRegressor, LinearRegression
    >>> from sklearn.datasets import make_regression
    >>> rng = np.random.RandomState(0)
    >>> X, y, coef = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)
    >>> X[:4] = rng.uniform(10, 20, (4, 2))
    >>> y[:4] = rng.uniform(10, 20, 4)
    >>> huber = HuberRegressor().fit(X, y)
    >>> huber.score(X, y)
    -7.284...
    >>> huber.predict(X[:1,])
    array([806.7200...])
    >>> linear = LinearRegression().fit(X, y)
    >>> print("True coefficients:", coef)
    True coefficients: [20.4923...  34.1698...]
    >>> print("Huber coefficients:", huber.coef_)
    Huber coefficients: [17.7906... 31.0106...]
    >>> print("Linear Regression coefficients:", linear.coef_)
    Linear Regression coefficients: [-1.9221...  7.0226...]
    """

    def __init__(
        self,
        *,
        epsilon=1.35,
        max_iter=100,
        alpha=0.0001,
        warm_start=False,
        fit_intercept=True,
        tol=1e-05,
    ):
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.alpha = alpha
        self.warm_start = warm_start
        self.fit_intercept = fit_intercept
        self.tol = tol

    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : array-like, shape (n_samples,), default=None
            Weight given to each sample.

        Returns
        -------
        self : object
            Fitted `HuberRegressor` estimator.

        Raises
        ------
        ValueError
            If ``epsilon`` is smaller than 1.0, or if the L-BFGS-B solver
            terminates with status 2.
        """
        X, y = self._validate_data(
            X,
            y,
            copy=False,
            accept_sparse=["csr"],
            y_numeric=True,
            dtype=[np.float64, np.float32],
        )

        sample_weight = _check_sample_weight(sample_weight, X)

        if self.epsilon < 1.0:
            raise ValueError(
                "epsilon should be greater than or equal to 1.0, got %f" % self.epsilon
            )

        if self.warm_start and hasattr(self, "coef_"):
            # _huber_loss_and_gradient decides whether an intercept is fitted
            # from the length of the parameter vector (n_features + 2 means
            # "with intercept"). The intercept slot must therefore only be
            # present when fit_intercept is True; otherwise a warm-started
            # fit would silently optimize an intercept that is discarded
            # afterwards.
            if self.fit_intercept:
                trailing = [self.intercept_, self.scale_]
            else:
                trailing = [self.scale_]
            parameters = np.concatenate((self.coef_, trailing))
        else:
            if self.fit_intercept:
                parameters = np.zeros(X.shape[1] + 2)
            else:
                parameters = np.zeros(X.shape[1] + 1)
            # Make sure to initialize the scale parameter to a strictly
            # positive value:
            parameters[-1] = 1

        # Sigma or the scale factor should be non-negative.
        # Setting it to be zero might cause undefined bounds hence we set it
        # to a value close to zero.
        bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))
        bounds[-1][0] = np.finfo(np.float64).eps * 10

        opt_res = optimize.minimize(
            _huber_loss_and_gradient,
            parameters,
            method="L-BFGS-B",
            jac=True,
            args=(X, y, self.epsilon, self.alpha, sample_weight),
            options={"maxiter": self.max_iter, "gtol": self.tol, "iprint": -1},
            bounds=bounds,
        )

        parameters = opt_res.x

        if opt_res.status == 2:
            raise ValueError(
                "HuberRegressor convergence failed: l-BFGS-b solver terminated with %s"
                % opt_res.message
            )
        self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

        # Unpack the optimized vector: coefficients first, scale last and,
        # if fitted, the intercept in the second-to-last slot.
        self.scale_ = parameters[-1]
        if self.fit_intercept:
            self.intercept_ = parameters[-2]
        else:
            self.intercept_ = 0.0
        self.coef_ = parameters[: X.shape[1]]

        # A sample is flagged as an outlier when its absolute residual
        # exceeds scale_ * epsilon.
        residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)
        self.outliers_ = residual > self.scale_ * self.epsilon
        return self

View File

@@ -0,0 +1,413 @@
"""
Loss functions for linear models with raw_prediction = X @ coef
"""
import numpy as np
from scipy import sparse
from ..utils.extmath import squared_norm
class LinearModelLoss:
    """General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the sum of per sample losses and includes a term for L2
    regularization::

        loss = sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = X.T @ loss.gradient + l2_reg_strength * coef
        hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof = n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                # F-ordered ravel: the last column is the last n_classes entries
                intercept = coef[n_classes * n_features:] = coef[-n_classes:]
                intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    """

    def __init__(self, base_loss, fit_intercept):
        self.base_loss = base_loss
        self.fit_intercept = fit_intercept

    def _w_intercept_raw(self, coef, X):
        """Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or \
            (n_samples, n_classes)
        """
        if not self.base_loss.is_multiclass:
            if self.fit_intercept:
                intercept = coef[-1]
                weights = coef[:-1]
            else:
                intercept = 0.0
                weights = coef
            raw_prediction = X @ weights + intercept
        else:
            # reshape to (n_classes, n_dof)
            if coef.ndim == 1:
                weights = coef.reshape((self.base_loss.n_classes, -1), order="F")
            else:
                weights = coef
            if self.fit_intercept:
                intercept = weights[:, -1]
                weights = weights[:, :-1]
            else:
                intercept = 0.0
            raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
        return weights, intercept, raw_prediction

    def _coef_gradient(self, coef, X, weights, grad_pointwise, l2_reg_strength):
        """Assemble the gradient w.r.t. coef from the per-sample gradients.

        Parameters
        ----------
        coef : ndarray
            Coefficients as passed by the caller; only its shape/ndim is used
            so that the result has the same layout.
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept, as returned by `_w_intercept_raw`.
        grad_pointwise : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Gradient of the base loss w.r.t. raw_prediction for each sample.
        l2_reg_strength : float
            L2 regularization strength; the intercept is not penalized.

        Returns
        -------
        gradient : ndarray of shape coef.shape
        """
        n_features = X.shape[1]
        if not self.base_loss.is_multiclass:
            grad = np.empty_like(coef, dtype=weights.dtype)
            grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
            if self.fit_intercept:
                grad[-1] = grad_pointwise.sum()
            return grad

        n_dof = n_features + int(self.fit_intercept)
        grad = np.empty(
            (self.base_loss.n_classes, n_dof), dtype=weights.dtype, order="F"
        )
        # grad_pointwise.shape = (n_samples, n_classes)
        grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
        if self.fit_intercept:
            grad[:, -1] = grad_pointwise.sum(axis=0)
        # Hand back the same layout (ravelled or 2d) that the caller used.
        return grad.ravel(order="F") if coef.ndim == 1 else grad

    def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1):
        """Compute the loss as sum over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        loss : float
            Sum of losses per sample plus penalty.
        """
        weights, _, raw_prediction = self._w_intercept_raw(coef, X)
        per_sample = self.base_loss.loss(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )
        norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
        return per_sample.sum() + 0.5 * l2_reg_strength * norm2_w

    def loss_gradient(
        self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
    ):
        """Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        loss : float
            Sum of losses per sample plus penalty.
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        """
        weights, _, raw_prediction = self._w_intercept_raw(coef, X)
        per_sample, grad_pointwise = self.base_loss.loss_gradient(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )
        loss = per_sample.sum()
        if not self.base_loss.is_multiclass:
            loss += 0.5 * l2_reg_strength * (weights @ weights)
        else:
            loss += 0.5 * l2_reg_strength * squared_norm(weights)
        grad = self._coef_gradient(coef, X, weights, grad_pointwise, l2_reg_strength)
        return loss, grad

    def gradient(
        self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
    ):
        """Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        """
        weights, _, raw_prediction = self._w_intercept_raw(coef, X)
        grad_pointwise = self.base_loss.gradient(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )
        return self._coef_gradient(coef, X, weights, grad_pointwise, l2_reg_strength)

    def gradient_hessian_product(
        self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
    ):
        """Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        hessp : callable
            Function that takes in a vector input of shape of gradient and
            returns matrix-vector product with hessian.
        """
        n_samples, n_features = X.shape
        n_dof = n_features + int(self.fit_intercept)
        weights, _, raw_prediction = self._w_intercept_raw(coef, X)

        if not self.base_loss.is_multiclass:
            grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
                y_true=y,
                raw_prediction=raw_prediction,
                sample_weight=sample_weight,
                n_threads=n_threads,
            )
            grad = self._coef_gradient(
                coef, X, weights, grad_pointwise, l2_reg_strength
            )

            # Precompute as much as possible: hX, hX_sum and hessian_sum
            hessian_sum = hess_pointwise.sum()
            if sparse.issparse(X):
                hX = (
                    sparse.dia_matrix(
                        (hess_pointwise, 0), shape=(n_samples, n_samples)
                    )
                    @ X
                )
            else:
                hX = hess_pointwise[:, np.newaxis] * X

            if self.fit_intercept:
                # Calculate the double derivative with respect to intercept.
                # Note: In case hX is sparse, hX.sum is a matrix object of shape
                # (1, n_features). Flatten with ravel rather than squeeze: squeeze
                # would yield a 0-d array when n_features == 1, and the matmul
                # `hX_sum @ s[:n_features]` in hessp rejects 0-d operands.
                hX_sum = np.asarray(hX.sum(axis=0)).ravel()

            # With intercept included and l2_reg_strength = 0, hessp returns
            # res = (X, 1)' @ diag(h) @ (X, 1) @ s
            #     = (X, 1)' @ (hX @ s[:n_features] + h * s[-1])
            # res[:n_features] = X' @ hX @ s[:n_features] + hX_sum * s[-1]
            # res[-1] = hX_sum @ s[:n_features] + sum(h) * s[-1]
            # where hX_sum = X' @ h = 1' @ hX.
            def hessp(s):
                ret = np.empty_like(s)
                if sparse.issparse(X):
                    ret[:n_features] = X.T @ (hX @ s[:n_features])
                else:
                    ret[:n_features] = np.linalg.multi_dot([X.T, hX, s[:n_features]])
                ret[:n_features] += l2_reg_strength * s[:n_features]

                if self.fit_intercept:
                    ret[:n_features] += s[-1] * hX_sum
                    ret[-1] = hX_sum @ s[:n_features] + hessian_sum * s[-1]
                return ret

        else:
            n_classes = self.base_loss.n_classes
            # Here we may safely assume HalfMultinomialLoss aka categorical
            # cross-entropy.
            # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e.
            # diagonal in the classes. Here, we want the matrix-vector product of the
            # full hessian. Therefore, we call gradient_proba.
            grad_pointwise, proba = self.base_loss.gradient_proba(
                y_true=y,
                raw_prediction=raw_prediction,
                sample_weight=sample_weight,
                n_threads=n_threads,
            )
            grad = self._coef_gradient(
                coef, X, weights, grad_pointwise, l2_reg_strength
            )

            # Full hessian-vector product, i.e. not only the diagonal part of the
            # hessian. Derivation with some index battle for input vector s:
            #   - sample index i
            #   - feature indices j, m
            #   - class indices k, l
            #   - 1_{k=l} is one if k=l else 0
            #   - p_i_k is the (predicted) probability that sample i belongs to class k
            #     for all i: sum_k p_i_k = 1
            #   - s_l_m is input vector for class l and feature m
            #   - X' = X transposed
            #
            # Note: Hessian with dropping most indices is just:
            #       X' @ p_k (1(k=l) - p_l) @ X
            #
            # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m
            #   = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l)
            #                   * X_{im} s_l_m
            #   = sum_{i, m} (X')_{ji} * p_i_k
            #                * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m)
            #
            # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
            def hessp(s):
                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
                if self.fit_intercept:
                    s_intercept = s[:, -1]
                    s = s[:, :-1]  # shape = (n_classes, n_features)
                else:
                    s_intercept = 0
                tmp = X @ s.T + s_intercept  # X_{im} * s_k_m
                tmp += (-proba * tmp).sum(axis=1)[:, np.newaxis]  # - sum_l ..
                tmp *= proba  # * p_i_k
                if sample_weight is not None:
                    tmp *= sample_weight[:, np.newaxis]
                # hess_prod has the same (n_classes, n_dof) F-ordered layout as
                # the 2d gradient; it is ravelled below when coef was 1d.
                hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
                hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
                if self.fit_intercept:
                    hess_prod[:, -1] = tmp.sum(axis=0)
                if coef.ndim == 1:
                    return hess_prod.ravel(order="F")
                else:
                    return hess_prod

        return grad, hessp

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,550 @@
# Authors: Rob Zinkov, Mathieu Blondel
# License: BSD 3 clause
from ._stochastic_gradient import BaseSGDClassifier
from ._stochastic_gradient import BaseSGDRegressor
from ._stochastic_gradient import DEFAULT_EPSILON
class PassiveAggressiveClassifier(BaseSGDClassifier):
    """Passive Aggressive Classifier.

    Read more in the :ref:`User Guide <passive_aggressive>`.

    Parameters
    ----------
    C : float, default=1.0
        Maximum step size (regularization).

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`partial_fit` method.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol).

        .. versionadded:: 0.19

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when the
        validation score is not improving. If set to True, it will
        automatically set aside a stratified fraction of training data as
        validation and terminate training when validation score is not
        improving by at least tol for n_iter_no_change consecutive epochs.

        .. versionadded:: 0.20

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.

        .. versionadded:: 0.20

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before early stopping.

        .. versionadded:: 0.20

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.

    loss : str, default="hinge"
        The loss function to be used:
        hinge: equivalent to PA-I in the reference paper.
        squared_hinge: equivalent to PA-II in the reference paper.

    n_jobs : int or None, default=None
        The number of CPUs to use to do the OVA (One Versus All, for
        multi-class problems) computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance, default=None
        Used to shuffle the training data, when ``shuffle`` is set to
        ``True``. Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.
        See :term:`the Glossary <warm_start>`.

        Repeatedly calling fit or partial_fit when warm_start is True can
        result in a different solution than when calling fit a single time
        because of the way the data is shuffled.

    class_weight : dict, {class_label: weight} or "balanced" or None, \
            default=None
        Preset for the class_weight fit parameter.

        Weights associated with classes. If not given, all classes
        are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

        .. versionadded:: 0.17
           parameter *class_weight* to automatically weight samples.

    average : bool or int, default=False
        When set to True, computes the averaged SGD weights and stores the
        result in the ``coef_`` attribute. If set to an int greater than 1,
        averaging will begin once the total number of samples seen reaches
        average. So average=10 will begin averaging after seeing 10 samples.

        .. versionadded:: 0.19
           parameter *average* to use weights averaging in SGD.

    Attributes
    ----------
    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
            (n_classes, n_features)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
        Constants in decision function.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.
        For multiclass fits, it is the maximum over every binary fit.

    classes_ : ndarray of shape (n_classes,)
        The unique classes labels.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples)``.

    loss_function_ : callable
        Loss function used by the algorithm.

    See Also
    --------
    SGDClassifier : Incrementally trained logistic regression.
    Perceptron : Linear perceptron classifier.

    References
    ----------
    Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
    K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)

    Examples
    --------
    >>> from sklearn.linear_model import PassiveAggressiveClassifier
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_features=4, random_state=0)
    >>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,
    ... tol=1e-3)
    >>> clf.fit(X, y)
    PassiveAggressiveClassifier(random_state=0)
    >>> print(clf.coef_)
    [[0.26642044 0.45070924 0.67251877 0.64185414]]
    >>> print(clf.intercept_)
    [1.84127814]
    >>> print(clf.predict([[0, 0, 0, 0]]))
    [1]
    """

    def __init__(
        self,
        *,
        C=1.0,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        shuffle=True,
        verbose=0,
        loss="hinge",
        n_jobs=None,
        random_state=None,
        warm_start=False,
        class_weight=None,
        average=False,
    ):
        # Settings fixed by this estimator come first, followed by the
        # user-facing options that are forwarded unchanged to the base class.
        super().__init__(
            penalty=None,
            eta0=1.0,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            n_jobs=n_jobs,
            random_state=random_state,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            class_weight=class_weight,
            warm_start=warm_start,
            average=average,
        )
        self.loss = loss
        self.C = C

    def partial_fit(self, X, y, classes=None):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Subset of the training data.

        y : array-like of shape (n_samples,)
            Subset of the target values.

        classes : ndarray of shape (n_classes,)
            Classes across all calls to partial_fit.
            Can be obtained by via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If ``class_weight`` is ``"balanced"``.
        """
        self._validate_params(for_partial_fit=True)

        if self.class_weight == "balanced":
            raise ValueError(
                "class_weight 'balanced' is not supported for partial_fit. "
                "For 'balanced' weights, use `sklearn.utils.compute_class_weight` "
                "with `class_weight='balanced'`. In place of y you can use a large "
                "enough subset of the full training set target to properly "
                "estimate the class frequency distributions. Pass the resulting "
                "weights as the class_weight parameter."
            )

        # "hinge" selects the PA-I step size rule, anything else PA-II.
        if self.loss == "hinge":
            step_rule = "pa1"
        else:
            step_rule = "pa2"

        return self._partial_fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="hinge",
            learning_rate=step_rule,
            max_iter=1,
            classes=classes,
            sample_weight=None,
            coef_init=None,
            intercept_init=None,
        )

    def fit(self, X, y, coef_init=None, intercept_init=None):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        coef_init : ndarray of shape (n_classes, n_features)
            The initial coefficients to warm-start the optimization.

        intercept_init : ndarray of shape (n_classes,)
            The initial intercept to warm-start the optimization.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._validate_params()

        # "hinge" selects the PA-I step size rule, anything else PA-II.
        if self.loss == "hinge":
            step_rule = "pa1"
        else:
            step_rule = "pa2"

        return self._fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="hinge",
            learning_rate=step_rule,
            coef_init=coef_init,
            intercept_init=intercept_init,
        )
class PassiveAggressiveRegressor(BaseSGDRegressor):
    """Passive Aggressive Regressor.

    Read more in the :ref:`User Guide <passive_aggressive>`.

    Parameters
    ----------
    C : float, default=1.0
        Maximum step size (regularization).

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`partial_fit` method.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol).

        .. versionadded:: 0.19

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when the
        validation score is not improving. If set to True, it will
        automatically set aside a fraction of training data as validation and
        terminate training when validation score is not improving by at least
        tol for n_iter_no_change consecutive epochs.

        .. versionadded:: 0.20

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.

        .. versionadded:: 0.20

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before early stopping.

        .. versionadded:: 0.20

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.

    loss : str, default="epsilon_insensitive"
        The loss function to be used:
        epsilon_insensitive: equivalent to PA-I in the reference paper.
        squared_epsilon_insensitive: equivalent to PA-II in the reference
        paper.

    epsilon : float, default=0.1
        If the difference between the current prediction and the correct label
        is below this threshold, the model is not updated.

    random_state : int, RandomState instance, default=None
        Used to shuffle the training data, when ``shuffle`` is set to
        ``True``. Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.
        See :term:`the Glossary <warm_start>`.

        Repeatedly calling fit or partial_fit when warm_start is True can
        result in a different solution than when calling fit a single time
        because of the way the data is shuffled.

    average : bool or int, default=False
        When set to True, computes the averaged SGD weights and stores the
        result in the ``coef_`` attribute. If set to an int greater than 1,
        averaging will begin once the total number of samples seen reaches
        average. So average=10 will begin averaging after seeing 10 samples.

        .. versionadded:: 0.19
           parameter *average* to use weights averaging in SGD.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,)
        Constants in decision function.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples)``.

    See Also
    --------
    SGDRegressor : Linear model fitted by minimizing a regularized
        empirical loss with SGD.

    References
    ----------
    Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
    K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006).

    Examples
    --------
    >>> from sklearn.linear_model import PassiveAggressiveRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_features=4, random_state=0)
    >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0,
    ... tol=1e-3)
    >>> regr.fit(X, y)
    PassiveAggressiveRegressor(max_iter=100, random_state=0)
    >>> print(regr.coef_)
    [20.48736655 34.18818427 67.59122734 87.94731329]
    >>> print(regr.intercept_)
    [-0.02306214]
    >>> print(regr.predict([[0, 0, 0, 0]]))
    [-0.02306214]
    """

    def __init__(
        self,
        *,
        C=1.0,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        shuffle=True,
        verbose=0,
        loss="epsilon_insensitive",
        epsilon=DEFAULT_EPSILON,
        random_state=None,
        warm_start=False,
        average=False,
    ):
        # Settings fixed by this estimator come first, followed by the
        # user-facing options that are forwarded unchanged to the base class.
        super().__init__(
            penalty=None,
            l1_ratio=0,
            eta0=1.0,
            epsilon=epsilon,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            random_state=random_state,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            warm_start=warm_start,
            average=average,
        )
        self.loss = loss
        self.C = C

    def partial_fit(self, X, y):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Subset of training data.

        y : numpy array of shape [n_samples]
            Subset of target values.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._validate_params(for_partial_fit=True)

        # "epsilon_insensitive" selects the PA-I step size rule, anything
        # else PA-II.
        if self.loss == "epsilon_insensitive":
            step_rule = "pa1"
        else:
            step_rule = "pa2"

        return self._partial_fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="epsilon_insensitive",
            learning_rate=step_rule,
            max_iter=1,
            sample_weight=None,
            coef_init=None,
            intercept_init=None,
        )

    def fit(self, X, y, coef_init=None, intercept_init=None):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : numpy array of shape [n_samples]
            Target values.

        coef_init : array, shape = [n_features]
            The initial coefficients to warm-start the optimization.

        intercept_init : array, shape = [1]
            The initial intercept to warm-start the optimization.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._validate_params()

        # "epsilon_insensitive" selects the PA-I step size rule, anything
        # else PA-II.
        if self.loss == "epsilon_insensitive":
            step_rule = "pa1"
        else:
            step_rule = "pa2"

        return self._fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="epsilon_insensitive",
            learning_rate=step_rule,
            coef_init=coef_init,
            intercept_init=intercept_init,
        )

View File

@@ -0,0 +1,207 @@
# Author: Mathieu Blondel
# License: BSD 3 clause
from ._stochastic_gradient import BaseSGDClassifier
class Perceptron(BaseSGDClassifier):
    """Linear perceptron classifier.

    Read more in the :ref:`User Guide <perceptron>`.

    Parameters
    ----------
    penalty : {'l2','l1','elasticnet'}, default=None
        The penalty (aka regularization term) to be used.

    alpha : float, default=0.0001
        Constant that multiplies the regularization term if regularization is
        used.

    l1_ratio : float, default=0.15
        The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.
        `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.
        Only used if `penalty='elasticnet'`.

        .. versionadded:: 0.24

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`partial_fit` method.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol).

        .. versionadded:: 0.19

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.

    eta0 : float, default=1
        Constant by which the updates are multiplied.

    n_jobs : int, default=None
        The number of CPUs to use to do the OVA (One Versus All, for
        multi-class problems) computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=0
        Used to shuffle the training data, when ``shuffle`` is set to
        ``True``. Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when the
        validation score is not improving. If set to True, it will
        automatically set aside a stratified fraction of training data as
        validation and terminate training when validation score is not
        improving by at least tol for n_iter_no_change consecutive epochs.

        .. versionadded:: 0.20

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.

        .. versionadded:: 0.20

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before early stopping.

        .. versionadded:: 0.20

    class_weight : dict, {class_label: weight} or "balanced", default=None
        Preset for the class_weight fit parameter.

        Weights associated with classes. If not given, all classes
        are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution. See
        :term:`the Glossary <warm_start>`.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The unique classes labels.

    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
            (n_classes, n_features)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
        Constants in decision function.

    loss_function_ : concrete LossFunction
        The function that determines the loss, or difference between the
        output of the algorithm and the target values.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.
        For multiclass fits, it is the maximum over every binary fit.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples)``.

    See Also
    --------
    sklearn.linear_model.SGDClassifier : Linear classifiers
        (SVM, logistic regression, etc.) with SGD training.

    Notes
    -----
    ``Perceptron`` is a classification algorithm which shares the same
    underlying implementation with ``SGDClassifier``. In fact,
    ``Perceptron()`` is equivalent to `SGDClassifier(loss="perceptron",
    eta0=1, learning_rate="constant", penalty=None)`.

    References
    ----------
    https://en.wikipedia.org/wiki/Perceptron and references therein.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.linear_model import Perceptron
    >>> X, y = load_digits(return_X_y=True)
    >>> clf = Perceptron(tol=1e-3, random_state=0)
    >>> clf.fit(X, y)
    Perceptron()
    >>> clf.score(X, y)
    0.939...
    """

    def __init__(
        self,
        *,
        penalty=None,
        alpha=0.0001,
        l1_ratio=0.15,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        eta0=1.0,
        n_jobs=None,
        random_state=0,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        class_weight=None,
        warm_start=False,
    ):
        # Settings fixed by this estimator come first, followed by the
        # user-facing options that are forwarded unchanged to the base class.
        super().__init__(
            loss="perceptron",
            learning_rate="constant",
            power_t=0.5,
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            eta0=eta0,
            n_jobs=n_jobs,
            random_state=random_state,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            class_weight=class_weight,
            warm_start=warm_start,
        )

View File

@@ -0,0 +1,304 @@
# Authors: David Dale <dale.david@mail.ru>
# Christian Lorentzen <lorentzen.ch@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
from scipy import sparse
from scipy.optimize import linprog
from ..base import BaseEstimator, RegressorMixin
from ._base import LinearModel
from ..exceptions import ConvergenceWarning
from ..utils import _safe_indexing
from ..utils.validation import _check_sample_weight
from ..utils.fixes import sp_version, parse_version
class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
    """Linear model fitted to a conditional quantile of the target.

    :class:`QuantileRegressor` minimizes the pinball loss for the requested
    `quantile`, which makes it robust to outliers. The coefficients are
    regularized with an L1 penalty, like
    :class:`~sklearn.linear_model.Lasso`.

    Read more in the :ref:`User Guide <quantile_regression>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    quantile : float, default=0.5
        Quantile the model should predict; must lie strictly between 0 and 1.
        The default of 0.5 yields the 50% quantile, i.e. the median.

    alpha : float, default=1.0
        Non-negative constant multiplying the L1 penalty term.

    fit_intercept : bool, default=True
        Whether an intercept is fitted.

    solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \
            'revised simplex'}, default='interior-point'
        Method handed to :func:`scipy.optimize.linprog` to solve the linear
        programming formulation. The highs methods are recommended with
        `scipy>=1.6.0` because they are the fastest ones. Solvers "highs-ds",
        "highs-ipm" and "highs" support sparse input data and, in fact,
        always convert to sparse csc.

    solver_options : dict, default=None
        Extra options forwarded to :func:`scipy.optimize.linprog`. When left
        as `None` with `solver='interior-point'`, `{"lstsq": True}` is passed
        for the sake of stability.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the features.

    intercept_ : float
        The intercept of the model, aka bias term.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations performed by the solver.

    See Also
    --------
    Lasso : The Lasso is a linear model that estimates sparse coefficients
        with l1 regularization.
    HuberRegressor : Linear regression model that is robust to outliers.

    Examples
    --------
    >>> from sklearn.linear_model import QuantileRegressor
    >>> import numpy as np
    >>> n_samples, n_features = 10, 2
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples)
    >>> X = rng.randn(n_samples, n_features)
    >>> reg = QuantileRegressor(quantile=0.8).fit(X, y)
    >>> np.mean(y <= reg.predict(X))
    0.8
    """

    def __init__(
        self,
        *,
        quantile=0.5,
        alpha=1.0,
        fit_intercept=True,
        solver="interior-point",
        solver_options=None,
    ):
        self.quantile = quantile
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.solver_options = solver_options

    def fit(self, X, y, sample_weight=None):
        """Fit the model by solving the quantile-regression linear program.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If `alpha`, `quantile`, `fit_intercept`, `solver` or
            `solver_options` is invalid, if the solver requires a newer scipy,
            or if `X` is sparse and the solver is not one of the highs
            methods.
        """
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csc", "csr", "coo"],
            y_numeric=True,
            multi_output=False,
        )
        sample_weight = _check_sample_weight(sample_weight, X)

        # One LP parameter per feature, plus one for the intercept if fitted.
        n_params = X.shape[1] + (1 if self.fit_intercept else 0)

        # Centering y and X with _preprocess_data does not work for quantile
        # regression. The objective is 1/n * sum(pinball loss) + alpha * L1,
        # so the penalty term is rescaled instead, which is equivalent.
        if not self.alpha >= 0:
            raise ValueError(
                f"Penalty alpha must be a non-negative number, got {self.alpha}"
            )
        alpha = np.sum(sample_weight) * self.alpha

        if self.quantile >= 1.0 or self.quantile <= 0.0:
            raise ValueError(
                f"Quantile should be strictly between 0.0 and 1.0, got {self.quantile}"
            )

        if not isinstance(self.fit_intercept, bool):
            raise ValueError(
                f"The argument fit_intercept must be bool, got {self.fit_intercept}"
            )

        highs_solvers = ("highs-ds", "highs-ipm", "highs")
        known_solvers = highs_solvers + ("interior-point", "revised simplex")
        if self.solver not in known_solvers:
            raise ValueError(f"Invalid value for argument solver, got {self.solver}")
        use_highs = self.solver in highs_solvers
        if use_highs and sp_version < parse_version("1.6.0"):
            raise ValueError(
                f"Solver {self.solver} is only available "
                f"with scipy>=1.6.0, got {sp_version}"
            )

        if sparse.issparse(X) and not use_highs:
            raise ValueError(
                f"Solver {self.solver} does not support sparse X. "
                "Use solver 'highs' for example."
            )

        options_given = self.solver_options is not None
        if options_given and not isinstance(self.solver_options, dict):
            raise ValueError(
                "Invalid value for argument solver_options, "
                "must be None or a dictionary, got "
                f"{self.solver_options}"
            )

        # Make the default solver more stable when the user gave no options.
        solver_options = self.solver_options
        if solver_options is None and self.solver == "interior-point":
            solver_options = {"lstsq": True}

        # After rescaling alpha, the minimization problem is
        #     min sum(pinball loss) + alpha * L1
        # Use linear programming formulation of quantile regression
        #     min_x c x
        #           A_eq x = b_eq
        #                0 <= x
        # x = (s0, s, t0, t, u, v) = slack variables >= 0
        # intercept = s0 - t0
        # coef = s - t
        # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n)
        # residual = y - X@coef - intercept = u - v
        # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n))
        # b_eq = y
        # p = n_features
        # n = n_samples
        # 1_n = vector of length n with entries equal one
        # see https://stats.stackexchange.com/questions/384909/
        #
        # Dropping zero-weight samples up front makes life easier for the
        # linprog solver; from here on the row count is n_kept, not n_samples.
        kept = np.nonzero(sample_weight)[0]
        n_kept = len(kept)
        if n_kept < len(sample_weight):
            sample_weight = sample_weight[kept]
            X = _safe_indexing(X, kept)
            y = _safe_indexing(y, kept)

        # Cost of the positive and negative parts of the parameters; the
        # intercept entries (first of each half) are not penalized.
        penalty_costs = np.full(2 * n_params, fill_value=alpha)
        if self.fit_intercept:
            penalty_costs[0] = 0
            penalty_costs[n_params] = 0
        c = np.concatenate(
            [
                penalty_costs,
                sample_weight * self.quantile,
                sample_weight * (1 - self.quantile),
            ]
        )

        if use_highs:
            # Note that highs methods always use a sparse CSC memory layout
            # internally, even for optimization problems parametrized using
            # dense numpy arrays. Therefore, we work with CSC matrices as
            # early as possible to limit unnecessary repeated memory copies.
            eye = sparse.eye(n_kept, dtype=X.dtype, format="csc")
            if self.fit_intercept:
                ones = sparse.csc_matrix(np.ones(shape=(n_kept, 1), dtype=X.dtype))
                blocks = [ones, X, -ones, -X, eye, -eye]
            else:
                blocks = [X, -X, eye, -eye]
            A_eq = sparse.hstack(blocks, format="csc")
        else:
            eye = np.eye(n_kept)
            if self.fit_intercept:
                ones = np.ones((n_kept, 1))
                blocks = [ones, X, -ones, -X, eye, -eye]
            else:
                blocks = [X, -X, eye, -eye]
            A_eq = np.concatenate(blocks, axis=1)

        result = linprog(
            c=c,
            A_eq=A_eq,
            b_eq=y,
            method=self.solver,
            options=solver_options,
        )

        if not result.success:
            status_text = {
                1: "Iteration limit reached.",
                2: "Problem appears to be infeasible.",
                3: "Problem appears to be unbounded.",
                4: "Numerical difficulties encountered.",
            }
            reason = status_text.get(result.status, "unknown reason")
            warnings.warn(
                "Linear programming for QuantileRegressor did not succeed.\n"
                f"Status is {result.status}: "
                + reason
                + "\n"
                + "Result message of linprog:\n"
                + result.message,
                ConvergenceWarning,
            )

        # The solution is laid out as (params_pos, params_neg, u, v); the
        # fitted parameters are positive slack minus negative slack.
        solution = result.x
        params = solution[:n_params] - solution[n_params : 2 * n_params]

        self.n_iter_ = result.nit

        if self.fit_intercept:
            self.coef_ = params[1:]
            self.intercept_ = params[0]
        else:
            self.coef_ = params
            self.intercept_ = 0.0
        return self

View File

@@ -0,0 +1,632 @@
# Author: Johannes Schönberger
#
# License: BSD 3 clause
import numpy as np
import warnings
from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone
from ..base import MultiOutputMixin
from ..utils import check_random_state, check_consistent_length
from ..utils.random import sample_without_replacement
from ..utils.validation import check_is_fitted, _check_sample_weight
from ._base import LinearRegression
from ..utils.validation import has_fit_parameter
from ..exceptions import ConvergenceWarning
# Spacing between 1.0 and the next representable float; used below as a lower
# bound so that the arguments of the logarithms never reach zero.
_EPSILON = np.spacing(1)


def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):
    """Estimate how many trials are needed to draw one outlier-free subset.

    The estimate is ``ceil(log(1 - probability) / log(1 - ratio**min_samples))``
    with ``ratio = n_inliers / n_samples``; both logarithm arguments are
    clamped from below by ``_EPSILON``.

    Parameters
    ----------
    n_inliers : int
        Number of inliers in the data.

    n_samples : int
        Total number of samples in the data.

    min_samples : int
        Minimum number of samples chosen randomly from original data.

    probability : float
        Probability (confidence) that one outlier-free sample is generated.

    Returns
    -------
    trials : float
        Number of trials. This is ``0`` when `probability` is 0 and
        ``float("inf")`` when the inlier ratio raised to `min_samples` is 0.
    """
    fraction_inliers = n_inliers / float(n_samples)

    # Probability of never drawing a clean subset, floored at _EPSILON.
    miss_probability = max(_EPSILON, 1 - probability)
    # Probability that a single subset contains an outlier, floored likewise.
    tainted_probability = max(_EPSILON, 1 - fraction_inliers**min_samples)

    if miss_probability == 1:
        return 0
    if tainted_probability == 1:
        return float("inf")

    n_trials = np.ceil(np.log(miss_probability) / np.log(tainted_probability))
    return abs(float(n_trials))
class RANSACRegressor(
    MetaEstimatorMixin, RegressorMixin, MultiOutputMixin, BaseEstimator
):
    """RANSAC (RANdom SAmple Consensus) algorithm.

    RANSAC is an iterative algorithm for the robust estimation of parameters
    from a subset of inliers from the complete data set.

    Read more in the :ref:`User Guide <ransac_regression>`.

    Parameters
    ----------
    estimator : object, default=None
        Base estimator object which implements the following methods:

         * `fit(X, y)`: Fit model to given training data and target values.
         * `score(X, y)`: Returns the score of the model on the given data,
           which is used for the stop criterion defined by `stop_score`.
           Additionally, the score is used to decide which of two equally
           large consensus sets is chosen as the better one.
         * `predict(X)`: Returns predicted values using the linear model,
           which is used to compute residual error using loss function.

        If `estimator` is None, then
        :class:`~sklearn.linear_model.LinearRegression` is used for
        target values of dtype float.

        Note that the current implementation only supports regression
        estimators.

    min_samples : int (>= 1) or float ([0, 1]), default=None
        Minimum number of samples chosen randomly from original data. Treated
        as an absolute number of samples for `min_samples >= 1`, treated as a
        relative number `ceil(min_samples * X.shape[0])` for
        `min_samples < 1`. This is typically chosen as the minimal number of
        samples necessary to estimate the given `estimator`. By default a
        ``sklearn.linear_model.LinearRegression()`` estimator is assumed and
        `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly
        dependent upon the model, so if an `estimator` other than
        :class:`linear_model.LinearRegression` is used, the user is
        encouraged to provide a value.

        .. deprecated:: 1.0
           Not setting `min_samples` explicitly will raise an error in version
           1.2 for models other than
           :class:`~sklearn.linear_model.LinearRegression`. To keep the old
           default behavior, set `min_samples=X.shape[1] + 1` explicitly.

    residual_threshold : float, default=None
        Maximum residual for a data sample to be classified as an inlier.
        By default the threshold is chosen as the MAD (median absolute
        deviation) of the target values `y`. Points whose residuals are
        strictly equal to the threshold are considered as inliers.

    is_data_valid : callable, default=None
        This function is called with the randomly selected data before the
        model is fitted to it: `is_data_valid(X, y)`. If its return value is
        False the current randomly chosen sub-sample is skipped.

    is_model_valid : callable, default=None
        This function is called with the estimated model and the randomly
        selected data: `is_model_valid(model, X, y)`. If its return value is
        False the current randomly chosen sub-sample is skipped.
        Rejecting samples with this function is computationally costlier than
        with `is_data_valid`. `is_model_valid` should therefore only be used if
        the estimated model is needed for making the rejection decision.

    max_trials : int, default=100
        Maximum number of iterations for random sample selection.

    max_skips : int, default=np.inf
        Maximum number of iterations that can be skipped due to finding zero
        inliers or invalid data defined by ``is_data_valid`` or invalid models
        defined by ``is_model_valid``.

        .. versionadded:: 0.19

    stop_n_inliers : int, default=np.inf
        Stop iteration if at least this number of inliers are found.

    stop_score : float, default=np.inf
        Stop iteration if score is greater equal than this threshold.

    stop_probability : float in range [0, 1], default=0.99
        RANSAC iteration stops if at least one outlier-free set of the training
        data is sampled in RANSAC. This requires to generate at least N
        samples (iterations)::

            N >= log(1 - probability) / log(1 - e**m)

        where the probability (confidence) is typically set to high value such
        as 0.99 (the default), e is the current fraction of inliers w.r.t.
        the total number of samples and m is the number of samples drawn in
        each random sub-sample (`min_samples`).

    loss : str, callable, default='absolute_error'
        String inputs, 'absolute_error' and 'squared_error' are supported which
        find the absolute error and squared error per sample respectively.

        If ``loss`` is a callable, then it should be a function that takes
        two arrays as inputs, the true and predicted value and returns a 1-D
        array with the i-th value of the array corresponding to the loss
        on ``X[i]``.

        If the loss on a sample is greater than the ``residual_threshold``,
        then this sample is classified as an outlier.

        .. versionadded:: 0.18

        .. deprecated:: 1.0
            The loss 'squared_loss' was deprecated in v1.0 and will be removed
            in version 1.2. Use `loss='squared_error'` which is equivalent.

        .. deprecated:: 1.0
            The loss 'absolute_loss' was deprecated in v1.0 and will be removed
            in version 1.2. Use `loss='absolute_error'` which is equivalent.

    random_state : int, RandomState instance, default=None
        The generator used to draw the random sub-samples. It is also set on
        the estimator when the estimator accepts a `random_state` parameter.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    base_estimator : object, default="deprecated"
        Use `estimator` instead.

        .. deprecated:: 1.1
            `base_estimator` is deprecated and will be removed in 1.3.
            Use `estimator` instead.

    Attributes
    ----------
    estimator_ : object
        Best fitted model (copy of the `estimator` object).

    n_trials_ : int
        Number of random selection trials until one of the stop criteria is
        met. It is always ``<= max_trials``.

    inlier_mask_ : bool array of shape [n_samples]
        Boolean mask of inliers classified as ``True``.

    n_skips_no_inliers_ : int
        Number of iterations skipped due to finding zero inliers.

        .. versionadded:: 0.19

    n_skips_invalid_data_ : int
        Number of iterations skipped due to invalid data defined by
        ``is_data_valid``.

        .. versionadded:: 0.19

    n_skips_invalid_model_ : int
        Number of iterations skipped due to an invalid model defined by
        ``is_model_valid``.

        .. versionadded:: 0.19

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    HuberRegressor : Linear regression model that is robust to outliers.
    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/RANSAC
    .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf
    .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf

    Examples
    --------
    >>> from sklearn.linear_model import RANSACRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = RANSACRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y)
    0.9885...
    >>> reg.predict(X[:1,])
    array([-31.9417...])
    """  # noqa: E501

    def __init__(
        self,
        estimator=None,
        *,
        min_samples=None,
        residual_threshold=None,
        is_data_valid=None,
        is_model_valid=None,
        max_trials=100,
        max_skips=np.inf,
        stop_n_inliers=np.inf,
        stop_score=np.inf,
        stop_probability=0.99,
        loss="absolute_error",
        random_state=None,
        base_estimator="deprecated",
    ):
        # Parameters are stored untouched; all validation happens in `fit`.
        self.estimator = estimator
        self.min_samples = min_samples
        self.residual_threshold = residual_threshold
        self.is_data_valid = is_data_valid
        self.is_model_valid = is_model_valid
        self.max_trials = max_trials
        self.max_skips = max_skips
        self.stop_n_inliers = stop_n_inliers
        self.stop_score = stop_score
        self.stop_probability = stop_probability
        self.random_state = random_state
        self.loss = loss
        self.base_estimator = base_estimator

    def fit(self, X, y, sample_weight=None):
        """Fit estimator using RANSAC algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample. An error is raised if
            `sample_weight` is passed and the estimator's `fit` method does
            not support it.

            .. versionadded:: 0.18

        Returns
        -------
        self : object
            Fitted `RANSACRegressor` estimator.

        Raises
        ------
        ValueError
            If `min_samples`, `stop_probability` or `loss` is invalid, if
            `sample_weight` is given but not supported by the estimator, or
            if no valid consensus set could be found. The latter occurs if
            `is_data_valid` and `is_model_valid` return False for all
            `max_trials` randomly chosen sub-samples.
        """
        # Need to validate separately here. We can't pass multi_output=True
        # because that would allow y to be csr. Delay expensive finiteness
        # check to the estimator's own input validation.
        check_X_params = dict(accept_sparse="csr", force_all_finite=False)
        check_y_params = dict(ensure_2d=False)
        X, y = self._validate_data(
            X, y, validate_separately=(check_X_params, check_y_params)
        )
        check_consistent_length(X, y)

        # The deprecated `base_estimator`, when given, overrides `estimator`
        # (note that this overwrites the `estimator` attribute).
        if self.base_estimator != "deprecated":
            warnings.warn(
                "`base_estimator` was renamed to `estimator` in version 1.1 and "
                "will be removed in 1.3.",
                FutureWarning,
            )
            self.estimator = self.base_estimator

        # Work on a clone so the user-supplied estimator is left unfitted.
        if self.estimator is not None:
            estimator = clone(self.estimator)
        else:
            estimator = LinearRegression()

        # Resolve `min_samples` into the sub-sample size used for each trial.
        if self.min_samples is None:
            if not isinstance(estimator, LinearRegression):
                # FIXME: in 1.2, turn this warning into an error
                warnings.warn(
                    "From version 1.2, `min_samples` needs to be explicitly "
                    "set otherwise an error will be raised. To keep the "
                    "current behavior, you need to set `min_samples` to "
                    f"`X.shape[1] + 1 that is {X.shape[1] + 1}",
                    FutureWarning,
                )
            min_samples = X.shape[1] + 1
        elif 0 < self.min_samples < 1:
            # Relative size: fraction of the number of samples, rounded up.
            min_samples = np.ceil(self.min_samples * X.shape[0])
        elif self.min_samples >= 1:
            if self.min_samples % 1 != 0:
                raise ValueError("Absolute number of samples must be an integer value.")
            min_samples = self.min_samples
        else:
            raise ValueError("Value for `min_samples` must be scalar and positive.")
        if min_samples > X.shape[0]:
            raise ValueError(
                "`min_samples` may not be larger than number "
                "of samples: n_samples = %d." % (X.shape[0])
            )

        if self.stop_probability < 0 or self.stop_probability > 1:
            raise ValueError("`stop_probability` must be in range [0, 1].")

        if self.residual_threshold is None:
            # MAD (median absolute deviation)
            residual_threshold = np.median(np.abs(y - np.median(y)))
        else:
            residual_threshold = self.residual_threshold

        # Build the per-sample loss; for 2-D targets the per-target losses
        # are summed over the target axis.
        # TODO: Remove absolute_loss in v1.2.
        if self.loss in ("absolute_error", "absolute_loss"):
            if self.loss == "absolute_loss":
                warnings.warn(
                    "The loss 'absolute_loss' was deprecated in v1.0 and will "
                    "be removed in version 1.2. Use `loss='absolute_error'` "
                    "which is equivalent.",
                    FutureWarning,
                )
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
            else:
                loss_function = lambda y_true, y_pred: np.sum(
                    np.abs(y_true - y_pred), axis=1
                )
        # TODO: Remove squared_loss in v1.2.
        elif self.loss in ("squared_error", "squared_loss"):
            if self.loss == "squared_loss":
                warnings.warn(
                    "The loss 'squared_loss' was deprecated in v1.0 and will "
                    "be removed in version 1.2. Use `loss='squared_error'` "
                    "which is equivalent.",
                    FutureWarning,
                )
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
            else:
                loss_function = lambda y_true, y_pred: np.sum(
                    (y_true - y_pred) ** 2, axis=1
                )

        elif callable(self.loss):
            loss_function = self.loss

        else:
            raise ValueError(
                "loss should be 'absolute_error', 'squared_error' or a "
                "callable. Got %s. "
                % self.loss
            )

        random_state = check_random_state(self.random_state)

        # Best effort: share the random state with the estimator and ignore
        # the ValueError raised when it has no such parameter.
        try:  # Not all estimator accept a random_state
            estimator.set_params(random_state=random_state)
        except ValueError:
            pass

        estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight")
        estimator_name = type(estimator).__name__
        if sample_weight is not None and not estimator_fit_has_sample_weight:
            raise ValueError(
                "%s does not support sample_weight. Samples"
                " weights are only used for the calibration"
                " itself." % estimator_name
            )
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # A candidate needs at least this many inliers to be considered, so
        # trials that yield zero inliers are always skipped.
        n_inliers_best = 1
        score_best = -np.inf
        inlier_mask_best = None
        X_inlier_best = None
        y_inlier_best = None
        inlier_best_idxs_subset = None
        self.n_skips_no_inliers_ = 0
        self.n_skips_invalid_data_ = 0
        self.n_skips_invalid_model_ = 0

        # number of data samples
        n_samples = X.shape[0]
        sample_idxs = np.arange(n_samples)

        self.n_trials_ = 0
        # Local copy: the trial budget may shrink as better sets are found.
        max_trials = self.max_trials
        while self.n_trials_ < max_trials:
            self.n_trials_ += 1

            # Give up once the total number of skipped trials exceeds the
            # allowed maximum.
            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                break

            # choose random sample set
            subset_idxs = sample_without_replacement(
                n_samples, min_samples, random_state=random_state
            )
            X_subset = X[subset_idxs]
            y_subset = y[subset_idxs]

            # check if random sample set is valid
            if self.is_data_valid is not None and not self.is_data_valid(
                X_subset, y_subset
            ):
                self.n_skips_invalid_data_ += 1
                continue

            # fit model for current random sample set
            if sample_weight is None:
                estimator.fit(X_subset, y_subset)
            else:
                estimator.fit(
                    X_subset, y_subset, sample_weight=sample_weight[subset_idxs]
                )

            # check if estimated model is valid
            if self.is_model_valid is not None and not self.is_model_valid(
                estimator, X_subset, y_subset
            ):
                self.n_skips_invalid_model_ += 1
                continue

            # residuals of all data for current random sample model
            y_pred = estimator.predict(X)
            residuals_subset = loss_function(y, y_pred)

            # classify data into inliers and outliers
            inlier_mask_subset = residuals_subset <= residual_threshold
            n_inliers_subset = np.sum(inlier_mask_subset)

            # less inliers -> skip current random sample
            if n_inliers_subset < n_inliers_best:
                self.n_skips_no_inliers_ += 1
                continue

            # extract inlier data set
            inlier_idxs_subset = sample_idxs[inlier_mask_subset]
            X_inlier_subset = X[inlier_idxs_subset]
            y_inlier_subset = y[inlier_idxs_subset]

            # score of inlier data set
            score_subset = estimator.score(X_inlier_subset, y_inlier_subset)

            # same number of inliers but worse score -> skip current random
            # sample
            if n_inliers_subset == n_inliers_best and score_subset < score_best:
                continue

            # save current random sample as best sample
            n_inliers_best = n_inliers_subset
            score_best = score_subset
            inlier_mask_best = inlier_mask_subset
            X_inlier_best = X_inlier_subset
            y_inlier_best = y_inlier_subset
            inlier_best_idxs_subset = inlier_idxs_subset

            # Tighten the trial budget using the best inlier count so far;
            # it can only decrease, never exceed the current limit.
            max_trials = min(
                max_trials,
                _dynamic_max_trials(
                    n_inliers_best, n_samples, min_samples, self.stop_probability
                ),
            )

            # break if sufficient number of inliers or score is reached
            if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:
                break

        # if none of the iterations met the required criteria
        if inlier_mask_best is None:
            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                raise ValueError(
                    "RANSAC skipped more iterations than `max_skips` without"
                    " finding a valid consensus set. Iterations were skipped"
                    " because each randomly chosen sub-sample failed the"
                    " passing criteria. See estimator attributes for"
                    " diagnostics (n_skips*)."
                )
            else:
                raise ValueError(
                    "RANSAC could not find a valid consensus set. All"
                    " `max_trials` iterations were skipped because each"
                    " randomly chosen sub-sample failed the passing criteria."
                    " See estimator attributes for diagnostics (n_skips*)."
                )
        else:
            # A consensus set exists, but the loop was cut short by the skip
            # limit: keep the result and warn.
            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                warnings.warn(
                    "RANSAC found a valid consensus set but exited"
                    " early due to skipping more iterations than"
                    " `max_skips`. See estimator attributes for"
                    " diagnostics (n_skips*).",
                    ConvergenceWarning,
                )

        # estimate final model using all inliers
        if sample_weight is None:
            estimator.fit(X_inlier_best, y_inlier_best)
        else:
            estimator.fit(
                X_inlier_best,
                y_inlier_best,
                sample_weight=sample_weight[inlier_best_idxs_subset],
            )

        self.estimator_ = estimator
        self.inlier_mask_ = inlier_mask_best
        return self

    def predict(self, X):
        """Predict using the estimated model.

        This is a wrapper for `estimator_.predict(X)`.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        y : array, shape = [n_samples] or [n_samples, n_targets]
            Returns predicted values.
        """
        check_is_fitted(self)
        # Finiteness checks are left to the wrapped estimator; `reset=False`
        # is passed because this is not a fitting call.
        X = self._validate_data(
            X,
            force_all_finite=False,
            accept_sparse=True,
            reset=False,
        )
        return self.estimator_.predict(X)

    def score(self, X, y):
        """Return the score of the prediction.

        This is a wrapper for `estimator_.score(X, y)`.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        Returns
        -------
        z : float
            Score of the prediction.
        """
        check_is_fitted(self)
        # Only X is validated here; y is passed to the wrapped estimator as is.
        X = self._validate_data(
            X,
            force_all_finite=False,
            accept_sparse=True,
            reset=False,
        )
        return self.estimator_.score(X, y)

    def _more_tags(self):
        # Marks `check_sample_weights_invariance` as an expected failure,
        # with the reason given in the string below.
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            }
        }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,372 @@
"""Solvers for Ridge and LogisticRegression using SAG algorithm"""
# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
#
# License: BSD 3 clause
import warnings
import numpy as np
from ._base import make_dataset
from ._sag_fast import sag32, sag64
from ..exceptions import ConvergenceWarning
from ..utils import check_array
from ..utils.validation import _check_sample_weight
from ..utils.extmath import row_norms
def get_auto_step_size(
    max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False
):
    """Compute automatic step size for SAG solver.

    A constant ``L`` is first derived from the loss:

    - ``0.25 * (max_squared_sum + fit_intercept) + alpha_scaled`` for the
      'log' and 'multinomial' losses;
    - ``max_squared_sum + fit_intercept + alpha_scaled`` for the 'squared'
      loss.

    The step size is then ``1 / L`` for SAG, and
    ``1 / (2 * L + min(2 * n_samples * alpha_scaled, L))`` for SAGA.

    Parameters
    ----------
    max_squared_sum : float
        Maximum squared sum of X over samples.

    alpha_scaled : float
        Constant that multiplies the regularization term, scaled by
        1. / n_samples, the number of samples.

    loss : {'log', 'squared', 'multinomial'}
        The loss function used in SAG solver.

    fit_intercept : bool
        Specifies if a constant (a.k.a. bias or intercept) will be
        added to the decision function.

    n_samples : int, default=None
        Number of rows in X. Required if is_saga=True, unused otherwise.

    is_saga : bool, default=False
        Whether to return step size for the SAGA algorithm or the SAG
        algorithm.

    Returns
    -------
    step_size : float
        Step size used in SAG solver.

    Raises
    ------
    ValueError
        If `loss` is not one of the supported losses, or if `is_saga` is
        True and `n_samples` is None.

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`
    """
    if loss in ("log", "multinomial"):
        L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled
    elif loss == "squared":
        # Lipschitz constant for squared loss
        L = max_squared_sum + int(fit_intercept) + alpha_scaled
    else:
        # The 1-tuple keeps %-formatting safe when `loss` is itself a tuple.
        raise ValueError(
            "Unknown loss function for SAG solver, got %s instead of 'log', "
            "'squared' or 'multinomial'" % (loss,)
        )
    if is_saga:
        # The SAGA formula multiplies by n_samples; fail with a clear message
        # rather than an opaque TypeError on `2 * None`.
        if n_samples is None:
            raise ValueError("n_samples must be provided when is_saga=True.")
        # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu n))
        # See Defazio et al. 2014
        mun = min(2 * n_samples * alpha_scaled, L)
        step = 1.0 / (2 * L + mun)
    else:
        # SAG theoretical step size is 1/16L but it is recommended to use 1 / L
        # see http://www.birs.ca//workshops//2014/14w5003/files/schmidt.pdf,
        # slide 65
        step = 1.0 / L
    return step
def sag_solver(
    X,
    y,
    sample_weight=None,
    loss="log",
    alpha=1.0,
    beta=0.0,
    max_iter=1000,
    tol=0.001,
    verbose=0,
    random_state=None,
    check_input=True,
    max_squared_sum=None,
    warm_start_mem=None,
    is_saga=False,
):
    """SAG solver for Ridge and LogisticRegression.

    SAG stands for Stochastic Average Gradient: the gradient of the loss is
    estimated one sample at a time and the model is updated along the way
    with a constant learning rate.

    IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the
    same scale. You can normalize the data by using
    sklearn.preprocessing.StandardScaler on your data before passing it to the
    fit method.

    The input may be a dense numpy array or a sparse scipy matrix of floating
    point values. The objective is the squared loss or a log loss, plus an L2
    penalty (squared euclidean norm) shrinking the parameters towards zero.

    .. versionadded:: 0.17

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : ndarray of shape (n_samples,)
        Target values. With loss='multinomial', y must be label encoded
        (see preprocessing.LabelEncoder).

    sample_weight : array-like of shape (n_samples,), default=None
        Weights applied to individual samples (1. for unweighted).

    loss : {'log', 'squared', 'multinomial'}, default='log'
        Loss function that will be optimized:
        -'log' is the binary logistic loss, as used in LogisticRegression.
        -'squared' is the squared loss, as used in Ridge.
        -'multinomial' is the multinomial logistic loss, as used in
         LogisticRegression.

        .. versionadded:: 0.18
           *loss='multinomial'*

    alpha : float, default=1.
        L2 regularization term in the objective function
        ``(0.5 * alpha * || W ||_F^2)``. It is divided by ``n_samples``
        before being handed to the solver.

    beta : float, default=0.
        L1 regularization term in the objective function
        ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.
        It is divided by ``n_samples`` before being handed to the solver.

    max_iter : int, default=1000
        The max number of passes over the training data if the stopping
        criteria is not reached. ``None`` is replaced by 1000.

    tol : float, default=0.001
        The stopping criteria for the weights. The iterations will stop when
        max(change in weights) / max(weights) < tol.

    verbose : int, default=0
        The verbosity level.

    random_state : int, RandomState instance or None, default=None
        Used when shuffling the data. Pass an int for reproducible output
        across multiple function calls.
        See :term:`Glossary <random_state>`.

    check_input : bool, default=True
        If False, the input arrays X and y will not be checked.

    max_squared_sum : float, default=None
        Maximum squared sum of X over samples. If None, it will be computed,
        going through all the samples. The value should be precomputed
        to speed up cross validation.

    warm_start_mem : dict, default=None
        The initialization parameters used for warm starting. Warm starting is
        currently used in LogisticRegression but not in Ridge.
        It contains:
            - 'coef': the weight vector, with the intercept in last line
                if the intercept is fitted.
            - 'gradient_memory': the scalar gradient for all seen samples.
            - 'sum_gradient': the sum of gradient over all seen samples,
                for each feature.
            - 'intercept_sum_gradient': the sum of gradient over all seen
                samples, for the intercept.
            - 'seen': array of boolean describing the seen samples.
            - 'num_seen': the number of seen samples.

    is_saga : bool, default=False
        Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves
        better in the first epochs, and allow for l1 regularisation.

    Returns
    -------
    coef_ : ndarray
        Weight vector. For loss='multinomial' this is the transposed
        coefficient array (one row per class); otherwise it is the single
        coefficient column. When an intercept is fitted it is included as the
        last entry (last column for 'multinomial').

    n_iter_ : int
        The number of full pass on all samples.

    warm_start_mem : dict
        Contains a 'coef' key with the fitted result, and possibly the
        fitted intercept at the end of the array. Contains also other keys
        used for warm starting.

    Raises
    ------
    ZeroDivisionError
        If ``step_size * alpha_scaled == 1``, a case the implementation does
        not handle.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import linear_model
    >>> n_samples, n_features = 10, 5
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(n_samples, n_features)
    >>> y = rng.randn(n_samples)
    >>> clf = linear_model.Ridge(solver='sag')
    >>> clf.fit(X, y)
    Ridge(solver='sag')

    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> y = np.array([1, 1, 2, 2])
    >>> clf = linear_model.LogisticRegression(
    ...     solver='sag', multi_class='multinomial')
    >>> clf.fit(X, y)
    LogisticRegression(multi_class='multinomial', solver='sag')

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`

    See Also
    --------
    Ridge, SGDRegressor, ElasticNet, Lasso, SVR,
    LogisticRegression, SGDClassifier, LinearSVC, Perceptron
    """
    if warm_start_mem is None:
        warm_start_mem = {}
    # Ridge passes max_iter=None by default; fall back to 1000 passes.
    if max_iter is None:
        max_iter = 1000

    if check_input:
        accepted_dtypes = [np.float64, np.float32]
        X = check_array(X, dtype=accepted_dtypes, accept_sparse="csr", order="C")
        y = check_array(y, dtype=accepted_dtypes, ensure_2d=False, order="C")

    n_samples, n_features = X.shape[0], X.shape[1]
    # As in SGD, the penalties are scaled by n_samples.
    alpha_scaled = float(alpha) / n_samples
    beta_scaled = float(beta) / n_samples

    # With loss == 'multinomial', y is expected to be label encoded.
    if loss == "multinomial":
        n_classes = int(y.max()) + 1
    else:
        n_classes = 1

    # ---- initialization (reuse warm-start entries when they are present) ----
    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    coef_init = (
        warm_start_mem["coef"]
        if "coef" in warm_start_mem
        # no stored coefficients: assume fit_intercept is False
        else np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
    )

    # coef_init possibly carries the intercept as an extra last row.
    # Note that Ridge centers the data before fitting, so fit_intercept=False.
    fit_intercept = coef_init.shape[0] == (n_features + 1)
    if fit_intercept:
        intercept_init = coef_init[-1, :]
        coef_init = coef_init[:-1, :]
    else:
        intercept_init = np.zeros(n_classes, dtype=X.dtype)

    intercept_sum_gradient = (
        warm_start_mem["intercept_sum_gradient"]
        if "intercept_sum_gradient" in warm_start_mem
        else np.zeros(n_classes, dtype=X.dtype)
    )
    gradient_memory_init = (
        warm_start_mem["gradient_memory"]
        if "gradient_memory" in warm_start_mem
        else np.zeros((n_samples, n_classes), dtype=X.dtype, order="C")
    )
    sum_gradient_init = (
        warm_start_mem["sum_gradient"]
        if "sum_gradient" in warm_start_mem
        else np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
    )
    seen_init = (
        warm_start_mem["seen"]
        if "seen" in warm_start_mem
        else np.zeros(n_samples, dtype=np.int32, order="C")
    )
    num_seen_init = warm_start_mem["num_seen"] if "num_seen" in warm_start_mem else 0

    dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)

    if max_squared_sum is None:
        max_squared_sum = row_norms(X, squared=True).max()
    step_size = get_auto_step_size(
        max_squared_sum,
        alpha_scaled,
        loss,
        fit_intercept,
        n_samples=n_samples,
        is_saga=is_saga,
    )
    if step_size * alpha_scaled == 1:
        raise ZeroDivisionError(
            "Current sag implementation does not handle "
            "the case step_size * alpha_scaled == 1"
        )

    # Pick the compiled routine matching the floating point precision of X.
    if X.dtype == np.float64:
        sag = sag64
    else:
        sag = sag32

    # The arrays passed here are the ones returned below, so the compiled
    # routine presumably updates them in place.
    num_seen, n_iter_ = sag(
        dataset,
        coef_init,
        intercept_init,
        n_samples,
        n_features,
        n_classes,
        tol,
        max_iter,
        loss,
        step_size,
        alpha_scaled,
        beta_scaled,
        sum_gradient_init,
        gradient_memory_init,
        seen_init,
        num_seen_init,
        fit_intercept,
        intercept_sum_gradient,
        intercept_decay,
        is_saga,
        verbose,
    )

    if n_iter_ == max_iter:
        warnings.warn(
            "The max_iter was reached which means the coef_ did not converge",
            ConvergenceWarning,
        )

    if fit_intercept:
        # Put the intercept back as the last row of the coefficient array.
        coef_init = np.vstack((coef_init, intercept_init))

    warm_start_mem = {
        "coef": coef_init,
        "sum_gradient": sum_gradient_init,
        "intercept_sum_gradient": intercept_sum_gradient,
        "gradient_memory": gradient_memory_init,
        "seen": seen_init,
        "num_seen": num_seen,
    }

    coef_ = coef_init.T if loss == "multinomial" else coef_init[:, 0]
    return coef_, n_iter_, warm_start_mem

View File

@@ -0,0 +1,26 @@
# License: BSD 3 clause
"""Helper to load LossFunction from sgd_fast.pyx to sag_fast.pyx"""
# Declaration of the LossFunction extension type: it exposes two C-level
# methods, ``loss`` and ``dloss``, each taking two doubles (p, y), returning a
# double and declared ``nogil``.
# NOTE(review): p/y are presumably prediction and target - confirm in the
# implementation file.
cdef class LossFunction:
cdef double loss(self, double p, double y) nogil
cdef double dloss(self, double p, double y) nogil
# Declaration of Regression, a subclass of LossFunction that re-declares the
# same ``loss`` / ``dloss`` nogil methods.
cdef class Regression(LossFunction):
cdef double loss(self, double p, double y) nogil
cdef double dloss(self, double p, double y) nogil
# Declaration of Classification, a subclass of LossFunction that re-declares
# the same ``loss`` / ``dloss`` nogil methods.
cdef class Classification(LossFunction):
cdef double loss(self, double p, double y) nogil
cdef double dloss(self, double p, double y) nogil
# Declaration of Log, a Classification subclass with ``loss`` / ``dloss``
# nogil methods (the implementation lives outside this declaration file).
cdef class Log(Classification):
cdef double loss(self, double p, double y) nogil
cdef double dloss(self, double p, double y) nogil
# Declaration of SquaredLoss, a Regression subclass with ``loss`` / ``dloss``
# nogil methods (the implementation lives outside this declaration file).
cdef class SquaredLoss(Regression):
cdef double loss(self, double p, double y) nogil
cdef double dloss(self, double p, double y) nogil

View File

@@ -0,0 +1,450 @@
"""
A Theil-Sen Estimator for Multiple Linear Regression Model
"""
# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
#
# License: BSD 3 clause
import warnings
import numbers
from itertools import combinations
import numpy as np
from scipy import linalg
from scipy.special import binom
from scipy.linalg.lapack import get_lapack_funcs
from joblib import Parallel, effective_n_jobs
from ._base import LinearModel
from ..base import RegressorMixin
from ..utils import check_random_state
from ..utils.validation import check_scalar
from ..utils.fixes import delayed
from ..exceptions import ConvergenceWarning
# Machine epsilon of the double precision float type. The Weiszfeld step below
# uses it as the threshold under which a norm is treated as zero.
_EPSILON = np.finfo(np.double).eps
def _modified_weiszfeld_step(X, x_old):
    """Modified Weiszfeld step.

    Performs one iteration of the approximation of the spatial median
    (L1 median). It is a form of an iteratively re-weighted least squares
    method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    x_old : ndarray of shape = (n_features,)
        Current start vector.

    Returns
    -------
    x_new : ndarray of shape (n_features,)
        New iteration step.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. Kärkkäinen and S. Äyrämö
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    """
    offsets = X - x_old
    distances = np.sqrt(np.sum(offsets**2, axis=1))
    # Samples that do not (numerically) coincide with x_old.
    distinct = distances >= _EPSILON
    # 1 if x_old equals at least one of the samples, 0 otherwise.
    is_x_old_in_X = int(distinct.sum() < X.shape[0])

    offsets = offsets[distinct]
    distances = distances[distinct][:, np.newaxis]
    unit_sum_norm = linalg.norm(np.sum(offsets / distances, axis=0))

    if unit_sum_norm > _EPSILON:  # to avoid division by zero
        weighted_samples = np.sum(X[distinct, :] / distances, axis=0)
        total_weight = np.sum(1 / distances, axis=0)
        new_direction = weighted_samples / total_weight
    else:
        new_direction = 1.0
        unit_sum_norm = 1.0

    # Blend the new estimate with x_old; the blend only moves away from the
    # new estimate when x_old coincides with a sample.
    ratio = is_x_old_in_X / unit_sum_norm
    return max(0.0, 1.0 - ratio) * new_direction + min(1.0, ratio) * x_old
def _spatial_median(X, max_iter=300, tol=1.0e-3):
"""Spatial median (L1 median).
The spatial median is member of a class of so-called M-estimators which
are defined by an optimization problem. Given a number of p points in an
n-dimensional space, the point x minimizing the sum of all distances to the
p other points is called spatial median.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples and
`n_features` is the number of features.
max_iter : int, default=300
Maximum number of iterations.
tol : float, default=1.e-3
Stop the algorithm if spatial_median has converged.
Returns
-------
spatial_median : ndarray of shape = (n_features,)
Spatial median.
n_iter : int
Number of iterations needed.
References
----------
- On Computation of Spatial Median for Robust Data Mining, 2005
T. Kärkkäinen and S. Äyrämö
http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
"""
if X.shape[1] == 1:
return 1, np.median(X.ravel(), keepdims=True)
tol **= 2 # We are computing the tol on the squared norm
spatial_median_old = np.mean(X, axis=0)
for n_iter in range(max_iter):
spatial_median = _modified_weiszfeld_step(X, spatial_median_old)
if np.sum((spatial_median_old - spatial_median) ** 2) < tol:
break
else:
spatial_median_old = spatial_median
else:
warnings.warn(
"Maximum number of iterations {max_iter} reached in "
"spatial median for TheilSen regressor."
"".format(max_iter=max_iter),
ConvergenceWarning,
)
return n_iter, spatial_median
def _breakdown_point(n_samples, n_subsamples):
"""Approximation of the breakdown point.
Parameters
----------
n_samples : int
Number of samples.
n_subsamples : int
Number of subsamples to consider.
Returns
-------
breakdown_point : float
Approximation of breakdown point.
"""
return (
1
- (
0.5 ** (1 / n_subsamples) * (n_samples - n_subsamples + 1)
+ n_subsamples
- 1
)
/ n_samples
)
def _lstsq(X, y, indices, fit_intercept):
"""Least Squares Estimator for TheilSenRegressor class.
This function calculates the least squares method on a subset of rows of X
and y defined by the indices array. Optionally, an intercept column is
added if intercept is set to true.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Design matrix, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : ndarray of shape (n_samples,)
Target vector, where `n_samples` is the number of samples.
indices : ndarray of shape (n_subpopulation, n_subsamples)
Indices of all subsamples with respect to the chosen subpopulation.
fit_intercept : bool
Fit intercept or not.
Returns
-------
weights : ndarray of shape (n_subpopulation, n_features + intercept)
Solution matrix of n_subpopulation solved least square problems.
"""
fit_intercept = int(fit_intercept)
n_features = X.shape[1] + fit_intercept
n_subsamples = indices.shape[1]
weights = np.empty((indices.shape[0], n_features))
X_subpopulation = np.ones((n_subsamples, n_features))
# gelss need to pad y_subpopulation to be of the max dim of X_subpopulation
y_subpopulation = np.zeros((max(n_subsamples, n_features)))
(lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation))
for index, subset in enumerate(indices):
X_subpopulation[:, fit_intercept:] = X[subset, :]
y_subpopulation[:n_subsamples] = y[subset]
weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features]
return weights
class TheilSenRegressor(RegressorMixin, LinearModel):
    """Theil-Sen Estimator: robust multivariate regression model.

    Least square solutions are computed on subsets of size n_subsamples of the
    samples in X. Any value of n_subsamples between the number of features
    and samples leads to an estimator with a compromise between robustness
    and efficiency. Since the number of least square solutions is
    "n_samples choose n_subsamples", it can be extremely large and can
    therefore be limited with max_subpopulation. If this limit is reached,
    the subsets are chosen randomly. In a final step, the spatial median
    (or L1 median) is calculated of all least square solutions.

    Read more in the :ref:`User Guide <theil_sen_regression>`.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    max_subpopulation : int, default=1e4
        Instead of computing with a set of cardinality 'n choose k', where n is
        the number of samples and k is the number of subsamples (at least
        number of features), consider only a stochastic subpopulation of a
        given maximal size if 'n choose k' is larger than max_subpopulation.
        For other than small problem sizes this parameter will determine
        memory usage and runtime if n_subsamples is not changed. Note that the
        data type should be int but floats such as 1e4 can be accepted too.

    n_subsamples : int, default=None
        Number of samples to calculate the parameters. This is at least the
        number of features (plus 1 if fit_intercept=True) and the number of
        samples as a maximum. A lower number leads to a higher breakdown
        point and a low efficiency while a high number leads to a low
        breakdown point and a high efficiency. If None, take the
        minimum number of subsamples leading to maximal robustness.
        If n_subsamples is set to n_samples, Theil-Sen is identical to least
        squares.

    max_iter : int, default=300
        Maximum number of iterations for the calculation of spatial median.

    tol : float, default=1e-3
        Tolerance when calculating spatial median.

    random_state : int, RandomState instance or None, default=None
        A random number generator instance to define the state of the random
        permutations generator. Pass an int for reproducible output across
        multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int, default=None
        Number of parallel jobs used to solve the least squares sub-problems.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Coefficients of the regression model (median of distribution).

    intercept_ : float
        Estimated intercept of regression model.

    breakdown_ : float
        Approximated breakdown point.

    n_iter_ : int
        Number of iterations needed for the spatial median.

    n_subpopulation_ : int
        Number of combinations taken into account from 'n choose k', where n is
        the number of samples and k is the number of subsamples.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    HuberRegressor : Linear regression model that is robust to outliers.
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
      Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
      http://home.olemiss.edu/~xdang/papers/MTSE.pdf

    Examples
    --------
    >>> from sklearn.linear_model import TheilSenRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = TheilSenRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y)
    0.9884...
    >>> reg.predict(X[:1,])
    array([-31.5871...])
    """

    def __init__(
        self,
        *,
        fit_intercept=True,
        copy_X=True,
        max_subpopulation=1e4,
        n_subsamples=None,
        max_iter=300,
        tol=1.0e-3,
        random_state=None,
        n_jobs=None,
        verbose=False,
    ):
        # Parameters are stored untouched; validation happens at fit time.
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.max_subpopulation = max_subpopulation
        self.n_subsamples = n_subsamples
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose

    def _check_subparams(self, n_samples, n_features):
        """Validate the subsampling parameters for the given data shape.

        Returns the subsample size to use and the number of subsets to draw,
        and stores the validated ``max_subpopulation`` on the instance as
        ``_max_subpopulation``.
        """
        n_subsamples = self.n_subsamples
        # Dimension of each least squares problem (features plus intercept).
        n_dim = n_features + 1 if self.fit_intercept else n_features

        if n_subsamples is None:
            n_subsamples = min(n_dim, n_samples)
        elif n_subsamples > n_samples:
            raise ValueError(
                "Invalid parameter since n_subsamples > "
                "n_samples ({0} > {1}).".format(n_subsamples, n_samples)
            )
        elif n_samples >= n_features:
            if n_dim > n_subsamples:
                plus_1 = "+1" if self.fit_intercept else ""
                raise ValueError(
                    "Invalid parameter since n_features{0} "
                    "> n_subsamples ({1} > {2})."
                    "".format(plus_1, n_dim, n_subsamples)
                )
        elif n_subsamples != n_samples:
            # n_samples < n_features: only the full sample is accepted.
            raise ValueError(
                "Invalid parameter since n_subsamples != "
                "n_samples ({0} != {1}) while n_samples "
                "< n_features.".format(n_subsamples, n_samples)
            )

        self._max_subpopulation = check_scalar(
            self.max_subpopulation,
            "max_subpopulation",
            # target_type should be numbers.Integral but can accept float
            # for backward compatibility reasons
            target_type=(numbers.Real, numbers.Integral),
            min_val=1,
        )
        n_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))
        n_subpopulation = int(min(self._max_subpopulation, n_combinations))
        return n_subsamples, n_subpopulation

    def fit(self, X, y):
        """Fit linear model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted `TheilSenRegressor` estimator.
        """
        rng = check_random_state(self.random_state)
        X, y = self._validate_data(X, y, y_numeric=True)
        n_samples, n_features = X.shape
        n_subsamples, self.n_subpopulation_ = self._check_subparams(
            n_samples, n_features
        )
        self.breakdown_ = _breakdown_point(n_samples, n_subsamples)

        if self.verbose:
            print("Breakdown point: {0}".format(self.breakdown_))
            print("Number of samples: {0}".format(n_samples))
            tol_outliers = int(self.breakdown_ * n_samples)
            print("Tolerable outliers: {0}".format(tol_outliers))
            print("Number of subpopulations: {0}".format(self.n_subpopulation_))

        # Determine indices of subpopulation: enumerate every combination when
        # that stays within the limit, otherwise draw random subsets.
        enumerate_all = (
            np.rint(binom(n_samples, n_subsamples)) <= self._max_subpopulation
        )
        if enumerate_all:
            indices = list(combinations(range(n_samples), n_subsamples))
        else:
            indices = [
                rng.choice(n_samples, size=n_subsamples, replace=False)
                for _ in range(self.n_subpopulation_)
            ]

        # Split the subsets into one chunk per job and solve them in parallel.
        n_jobs = effective_n_jobs(self.n_jobs)
        chunks = np.array_split(indices, n_jobs)
        per_job_weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_lstsq)(X, y, chunk, self.fit_intercept) for chunk in chunks
        )
        weights = np.vstack(per_job_weights)

        self.n_iter_, coefs = _spatial_median(
            weights, max_iter=self.max_iter, tol=self.tol
        )

        if self.fit_intercept:
            # The intercept is the first entry of each solution vector.
            self.intercept_, self.coef_ = coefs[0], coefs[1:]
        else:
            self.intercept_, self.coef_ = 0.0, coefs

        return self

View File

@@ -0,0 +1,49 @@
import os
import numpy
from sklearn._build_utils import gen_from_templates
def configuration(parent_package="", top_path=None):
    """Build the numpy.distutils configuration of the linear_model package.

    Registers the ``_cd_fast``, ``_sgd_fast`` and ``_sag_fast`` extensions
    (the latter after generating its source from a template) and the
    ``tests``, ``_glm`` and ``_glm/tests`` sub-packages.
    """
    from numpy.distutils.misc_util import Configuration

    config = Configuration("linear_model", parent_package, top_path)

    # Link against the math library on POSIX platforms.
    libraries = ["m"] if os.name == "posix" else []

    for extension in ("_cd_fast", "_sgd_fast"):
        config.add_extension(
            extension,
            sources=[extension + ".pyx"],
            include_dirs=numpy.get_include(),
            libraries=libraries,
        )

    # generate sag_fast from template
    gen_from_templates(["sklearn/linear_model/_sag_fast.pyx.tp"])
    config.add_extension(
        "_sag_fast", sources=["_sag_fast.pyx"], include_dirs=numpy.get_include()
    )

    # add other directories
    for subpackage in ("tests", "_glm", "_glm/tests"):
        config.add_subpackage(subpackage)

    return config
if __name__ == "__main__":
    # Script entry point: hand this package's configuration to
    # numpy.distutils' setup().
    from numpy.distutils.core import setup

    setup_kwargs = configuration(top_path="").todict()
    setup(**setup_kwargs)

View File

@@ -0,0 +1,742 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Maria Telenczuk <https://github.com/maikia>
#
# License: BSD 3 clause
import pytest
import warnings
import numpy as np
from scipy import sparse
from scipy import linalg
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils import check_random_state
from sklearn.linear_model import LinearRegression
from sklearn.linear_model._base import _deprecate_normalize
from sklearn.linear_model._base import _preprocess_data
from sklearn.linear_model._base import _rescale_data
from sklearn.linear_model._base import make_dataset
from sklearn.datasets import make_sparse_uncorrelated
from sklearn.datasets import make_regression
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import add_dummy_feature
# Module-level random generator (seed 0) shared by the tests below that do not
# create their own; draws from it therefore depend on test execution order.
rng = np.random.RandomState(0)
# Relative tolerance constant; presumably consumed by assertions elsewhere in
# this module (it is not referenced in the nearby tests) - TODO confirm.
rtol = 1e-6
def test_linear_regression():
    # LinearRegression on tiny datasets: two points on the line y = x, then
    # a degenerate single-sample problem.
    features, targets = [[1], [2]], [1, 2]

    model = LinearRegression()
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [1])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [1, 2])

    # degenerate input: one sample only
    features, targets = [[1]], [0]

    model = LinearRegression()
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [0])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [0])
@pytest.mark.parametrize("array_constr", [np.array, sparse.csr_matrix])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sample_weights(array_constr, fit_intercept):
    # Weighted LinearRegression must match the closed-form weighted least
    # squares solution, for dense and sparse X.
    rng = np.random.RandomState(0)

    # The system must not be under-determined for the closed form to apply.
    n_samples, n_features = 6, 5

    X = array_constr(rng.normal(size=(n_samples, n_features)))
    y = rng.normal(size=n_samples)
    sample_weight = 1.0 + rng.uniform(size=n_samples)

    # Estimator fitted with explicit sample_weight
    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X, y, sample_weight=sample_weight)
    fitted_coef = reg.coef_
    fitted_intercept = reg.intercept_

    # sanity checks
    assert reg.coef_.shape == (X.shape[1],)
    assert reg.score(X, y) > 0.5

    # Closed form of the weighted least square
    # theta = (X^T W X)^(-1) @ X^T W y
    W = np.diag(sample_weight)
    X_aug = add_dummy_feature(X) if fit_intercept else X

    lhs = X_aug.T @ W @ X_aug
    rhs = X_aug.T @ W @ y
    closed_form = linalg.solve(lhs, rhs)

    if fit_intercept:
        # The dummy feature comes first, so entry 0 is the intercept.
        assert_allclose(fitted_coef, closed_form[1:])
        assert_allclose(fitted_intercept, closed_form[0])
    else:
        assert_allclose(fitted_coef, closed_form)
def test_raises_value_error_if_positive_and_sparse():
    # With positive=True, fitting on a sparse X must raise a TypeError.
    expected_message = "A sparse matrix was passed, but dense data is required."

    sparse_X = sparse.eye(10)
    targets = np.ones(10)
    model = LinearRegression(positive=True)

    with pytest.raises(TypeError, match=expected_message):
        model.fit(sparse_X, targets)
def test_raises_value_error_if_sample_weights_greater_than_1d():
    # Sample weights must be either scalar or 1D; this checks that such
    # weights are accepted by fit.
    shapes = zip([2, 3], [3, 2])

    for n_samples, n_features in shapes:
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        array_weights = rng.randn(n_samples) ** 2 + 1

        reg = LinearRegression()

        # make sure the "OK" sample weights actually work: one 1D array and
        # two scalars
        for weights in (array_weights, 1.0, 2.0):
            reg.fit(X, y, weights)
def test_fit_intercept():
    # The shape of coef_ must not depend on fit_intercept, and its number of
    # dimensions must not depend on the number of features.
    two_features = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
    three_features = np.array(
        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
    )
    targets = np.array([1, 1])

    no_intercept_2 = LinearRegression(fit_intercept=False).fit(two_features, targets)
    with_intercept_2 = LinearRegression().fit(two_features, targets)

    no_intercept_3 = LinearRegression(fit_intercept=False).fit(
        three_features, targets
    )
    with_intercept_3 = LinearRegression().fit(three_features, targets)

    assert with_intercept_2.coef_.shape == no_intercept_2.coef_.shape
    assert with_intercept_3.coef_.shape == no_intercept_3.coef_.shape
    assert no_intercept_2.coef_.ndim == no_intercept_3.coef_.ndim
def test_error_on_wrong_normalize():
    # An unsupported value for normalize must be rejected with a ValueError.
    with pytest.raises(ValueError, match="Leave 'normalize' to its default"):
        _deprecate_normalize("wrong", True, "estimator")
@pytest.mark.parametrize("normalize", [True, False, "deprecated"])
@pytest.mark.parametrize("default", [True, False])
# FIXME update test in 1.2 for new versions
def test_deprecate_normalize(normalize, default):
    # Cover every combination of the normalize value and its default during
    # the deprecation of the normalize parameter.
    left_at_default = normalize == "deprecated"

    # The helper is expected to return the default when normalize is left
    # untouched, and the explicit value otherwise.
    output = default if left_at_default else normalize

    if default:
        if left_at_default:
            # warning to pass False and use StandardScaler
            expected = FutureWarning
            warning_msg = ["False", "1.2", "StandardScaler("]
        else:
            # no warning
            expected = None
            warning_msg = []
    else:
        if left_at_default:
            # no warning
            expected = None
            warning_msg = []
        else:
            expected = FutureWarning
            warning_msg = [
                "1.2",
                "StandardScaler(" if normalize else "default value",
            ]

    if expected is None:
        # Turn any FutureWarning into an error to prove none is emitted.
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            _normalize = _deprecate_normalize(normalize, default, "estimator")
    else:
        with pytest.warns(expected) as record:
            _normalize = _deprecate_normalize(normalize, default, "estimator")
        first_message = str(record[0].message)
        assert all(fragment in first_message for fragment in warning_msg)

    assert _normalize == output
def test_linear_regression_sparse(random_state=0):
    # Linear regression must also work with sparse data (identity design
    # matrix, repeated for several random coefficient vectors).
    random_state = check_random_state(random_state)
    n = 100

    for _ in range(10):
        X = sparse.eye(n, n)
        beta = random_state.rand(n)
        y = X * beta[:, np.newaxis]
        flat_y = y.ravel()

        ols = LinearRegression()
        ols.fit(X, flat_y)

        assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)
        assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)
# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    # Fitting on a dense array and on its CSR copy must give the same model.
    rng = check_random_state(0)
    n_samples, n_features = 200, 2

    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0  # zero out most entries so the sparse format is meaningful
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)

    dense_model = LinearRegression(normalize=normalize, fit_intercept=fit_intercept)
    sparse_model = LinearRegression(normalize=normalize, fit_intercept=fit_intercept)
    dense_model.fit(X, y)
    sparse_model.fit(Xcsr, y)

    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
    assert_allclose(dense_model.coef_, sparse_model.coef_)
def test_linear_regression_multiple_outcome(random_state=0):
    # Fitting two identical targets at once must agree with fitting the
    # single target.
    X, y = make_regression(random_state=random_state)
    stacked_targets = np.vstack((y, y)).T
    n_features = X.shape[1]

    model = LinearRegression()

    model.fit(X, stacked_targets)
    assert model.coef_.shape == (2, n_features)
    multi_pred = model.predict(X)

    model.fit(X, y)
    single_pred = model.predict(X)

    assert_array_almost_equal(
        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3
    )
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    # Same multiple-outcome consistency check, with X given in COO format.
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    X = sparse.coo_matrix(X)
    stacked_targets = np.vstack((y, y)).T
    n_features = X.shape[1]

    model = LinearRegression()

    model.fit(X, stacked_targets)
    assert model.coef_.shape == (2, n_features)
    multi_pred = model.predict(X)

    model.fit(X, y.ravel())
    single_pred = model.predict(X)

    assert_array_almost_equal(
        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3
    )
def test_linear_regression_positive():
    # Nonnegative LinearRegression on tiny datasets: two points on y = x,
    # then a degenerate single-sample problem.
    features, targets = [[1], [2]], [1, 2]

    model = LinearRegression(positive=True)
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [1])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [1, 2])

    # degenerate input: one sample only
    features, targets = [[1]], [0]

    model = LinearRegression(positive=True)
    model.fit(features, targets)

    assert_allclose(model.coef_, [0])
    assert_allclose(model.intercept_, [0])
    assert_allclose(model.predict(features), [0])
def test_linear_regression_positive_multiple_outcome(random_state=0):
    # Multiple-outcome consistency check for nonnegative linear regression;
    # every fitted coefficient must also be nonnegative.
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    stacked_targets = np.vstack((y, y)).T
    n_features = X.shape[1]

    model = LinearRegression(positive=True)

    model.fit(X, stacked_targets)
    assert model.coef_.shape == (2, n_features)
    assert np.all(model.coef_ >= 0.0)
    multi_pred = model.predict(X)

    model.fit(X, y.ravel())
    single_pred = model.predict(X)

    assert_allclose(np.vstack((single_pred, single_pred)).T, multi_pred)
def test_linear_regression_positive_vs_nonpositive():
    # On this dataset the positive=True and positive=False fits must differ
    # noticeably.
    X, y = make_sparse_uncorrelated(random_state=0)

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)
    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    mean_squared_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
    assert mean_squared_gap > 1e-3
def test_linear_regression_positive_vs_nonpositive_when_positive():
    """Both fits must agree when the generating coefficients are all positive."""
    n_samples = 200
    n_features = 4
    X = rng.rand(n_samples, n_features)
    # Noise-free target built from strictly positive weights.
    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)
    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    coef_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
    assert coef_gap < 1e-6
def test_linear_regression_pd_sparse_dataframe_warning():
    """Check the warning raised for partially sparse pandas DataFrames.

    ``LinearRegression.fit`` must emit a ``UserWarning`` when only some of the
    feature columns are pandas sparse arrays, and must not emit one when every
    column of the frame is sparse. The test is skipped if pandas is missing.
    """
    pd = pytest.importorskip("pandas")

    # Column "0" is dense; columns "1".."3" are sparse arrays (8 of 10 entries
    # are zero). The loop starts at 1, so no guard on the column index is
    # needed to keep the first column dense.
    df = pd.DataFrame({"0": np.random.randn(10)})
    for col in range(1, 4):
        arr = np.random.randn(10)
        arr[:8] = 0
        df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0)

    msg = "pandas.DataFrame with sparse columns found."
    reg = LinearRegression()

    # Features are columns "0" (dense) and "1" (sparse): mixed, so it warns.
    with pytest.warns(UserWarning, match=msg):
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])

    # Once column "0" is sparse too, the whole frame is sparse and fitting
    # must not warn: any UserWarning is escalated to an error.
    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
    assert hasattr(df, "sparse")

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])
def test_preprocess_data():
    """Check ``_preprocess_data`` on dense input for each intercept/normalize mode."""
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_mean_ref = np.mean(X, axis=0)
    X_scale_ref = np.std(X, axis=0) * np.sqrt(X.shape[0])
    y_mean_ref = np.mean(y, axis=0)
    unit_scale = np.ones(n_features)

    # No intercept, no normalization: data passes through unchanged.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, unit_scale)
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    # Intercept only: X and y are centered, not scaled.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, X_mean_ref)
    assert_array_almost_equal(y_mean, y_mean_ref)
    assert_array_almost_equal(X_scale, unit_scale)
    assert_array_almost_equal(Xt, X - X_mean_ref)
    assert_array_almost_equal(yt, y - y_mean_ref)

    # Intercept and normalization: X is centered then divided by its scale.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, X_mean_ref)
    assert_array_almost_equal(y_mean, y_mean_ref)
    assert_array_almost_equal(X_scale, X_scale_ref)
    assert_array_almost_equal(Xt, (X - X_mean_ref) / X_scale_ref)
    assert_array_almost_equal(yt, y - y_mean_ref)
def test_preprocess_data_multioutput():
    """Check how ``_preprocess_data`` centers a two-column target.

    The same expectations are checked for a dense ``X`` and for its CSC
    sparse counterpart.
    """
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    y_mean_ref = np.mean(y, axis=0)

    for X_variant in [X, sparse.csc_matrix(X)]:
        # Without intercept, y is untouched and its reported mean is zero.
        _, yt, _, y_mean, _ = _preprocess_data(
            X_variant, y, fit_intercept=False, normalize=False
        )
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        # With intercept, y is centered column-wise.
        _, yt, _, y_mean, _ = _preprocess_data(
            X_variant, y, fit_intercept=True, normalize=False
        )
        assert_array_almost_equal(y_mean, y_mean_ref)
        assert_array_almost_equal(yt, y - y_mean)

        # Normalization of X does not change how y is handled.
        _, yt, _, y_mean, _ = _preprocess_data(
            X_variant, y, fit_intercept=True, normalize=True
        )
        assert_array_almost_equal(y_mean, y_mean_ref)
        assert_array_almost_equal(yt, y - y_mean)
@pytest.mark.parametrize("is_sparse", [False, True])
def test_preprocess_data_weighted(is_sparse):
    """Check ``_preprocess_data`` with ``sample_weight`` on dense and CSR input.

    Expected means and scales are computed by hand with weighted averages,
    then compared against the output of ``_preprocess_data`` (with and without
    ``normalize``) and against a weighted ``StandardScaler``.
    """
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each columns in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0
    y = rng.rand(n_samples)

    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # Weighted per-feature variance, used to derive the expected scale.
    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average(
        (X - X_sample_weight_avg) ** 2, weights=sample_weight, axis=0
    )
    # Features 2 and 3 were set to constants above, so only they are flagged.
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())

    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=False,
        sample_weight=sample_weight,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if is_sparse:
        # Sparse X is expected to come back unchanged (not centered).
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=True,
        sample_weight=sample_weight,
    )

    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)

    if is_sparse:
        # X is not centered
        assert_array_almost_equal(Xt.toarray(), X.toarray() / expected_X_scale)
    else:
        assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
    # or np.sqrt(sample_weight.sum()) if weighted.
    if is_sparse:
        scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

        # Non-constant features are scaled similarly, up to the
        # np.sqrt(sample_weight.sum()) ratio.
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, :2] / np.sqrt(sample_weight.sum()),
            Xt.toarray()[:, :2],
        )

        # Constant features go through un-scaled.
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, 2:], Xt.toarray()[:, 2:]
        )
    else:
        scaler = StandardScaler(with_mean=True).fit(X, sample_weight=sample_weight)
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(
            scaler.transform(X) / np.sqrt(sample_weight.sum()),
            Xt,
        )
    assert_array_almost_equal(yt, y - expected_y_mean)
def test_sparse_preprocess_data_offsets():
    """Check ``_preprocess_data`` on a LIL sparse matrix.

    The sparse ``X`` is never centered: only the offsets (means) and scales
    are reported, and scaling is applied to the uncentered data.
    """
    n_samples = 200
    n_features = 2
    # random_state not supported yet in sparse.rand
    X = sparse.rand(n_samples, n_features, density=0.5).tolil()  # , random_state=rng
    y = rng.rand(n_samples)

    X_dense = X.toarray()
    X_mean_ref = np.mean(X_dense, axis=0)
    y_mean_ref = np.mean(y, axis=0)
    X_scale_ref = np.std(X_dense, axis=0) * np.sqrt(X.shape[0])
    unit_scale = np.ones(n_features)

    # No intercept, no normalization.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, unit_scale)
    assert_array_almost_equal(Xt.A, X_dense)
    assert_array_almost_equal(yt, y)

    # Intercept only: means are reported, y is centered, X is left as is.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, X_mean_ref)
    assert_array_almost_equal(y_mean, y_mean_ref)
    assert_array_almost_equal(X_scale, unit_scale)
    assert_array_almost_equal(Xt.A, X_dense)
    assert_array_almost_equal(yt, y - y_mean_ref)

    # Intercept and normalization: X is scaled but still not centered.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, X_mean_ref)
    assert_array_almost_equal(y_mean, y_mean_ref)
    assert_array_almost_equal(X_scale, X_scale_ref)
    assert_array_almost_equal(Xt.A, X_dense / X_scale_ref)
    assert_array_almost_equal(yt, y - y_mean_ref)
def test_csr_preprocess_data():
    """CSR input to ``_preprocess_data`` must come back in CSR format."""
    X, y = make_regression()
    # Zero out most entries so the matrix is genuinely sparse.
    X[X < 2.5] = 0.0
    X_csr = sparse.csr_matrix(X)

    X_out, y, _, _, _ = _preprocess_data(X_csr, y, True)

    assert X_out.getformat() == "csr"
@pytest.mark.parametrize("is_sparse", (True, False))
@pytest.mark.parametrize("to_copy", (True, False))
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    """Check that ``copy`` is honoured when ``check_input=False``.

    The returned ``X`` must share memory with the input exactly when
    ``to_copy`` is False, for both dense arrays and CSR matrices.
    """
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False)

    # For sparse matrices the underlying ``data`` buffers are compared.
    if is_sparse:
        out_buffer, in_buffer = X_.data, X.data
    else:
        out_buffer, in_buffer = X_, X

    shares_memory = np.may_share_memory(out_buffer, in_buffer)
    if to_copy:
        assert not shares_memory
    else:
        assert shares_memory
def test_dtype_preprocess_data():
    """Check the output dtypes of ``_preprocess_data``.

    For every combination of ``fit_intercept`` and ``normalize``, all five
    outputs must have the dtype of the input ``X`` (whatever the dtype of
    ``y``), the inputs must keep their dtype, and the float32 results must
    match the float64 ones.
    """
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    # (X input, y input, dtype expected for every output)
    cases = [
        (X_32, y_32, np.float32),
        (X_64, y_64, np.float64),
        (X_32, y_64, np.float32),
        (X_64, y_32, np.float64),
    ]

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            results = []
            for X_in, y_in, expected_dtype in cases:
                Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
                    X_in,
                    y_in,
                    fit_intercept=fit_intercept,
                    normalize=normalize,
                )
                outputs = (Xt, yt, X_mean, y_mean, X_scale)
                for out in outputs:
                    assert out.dtype == expected_dtype
                results.append(outputs)

            # The inputs themselves must keep their dtype.
            assert X_32.dtype == np.float32
            assert y_32.dtype == np.float32
            assert X_64.dtype == np.float64
            assert y_64.dtype == np.float64

            # Pure float32 and pure float64 runs agree numerically.
            for out_32, out_64 in zip(results[0], results[1]):
                assert_array_almost_equal(out_32, out_64)
@pytest.mark.parametrize("n_targets", [None, 2])
def test_rescale_data_dense(n_targets):
    """Check ``_rescale_data`` against a manual rescaling.

    ``X`` and ``y`` must each be multiplied row-wise by the ``sqrt_sw``
    vector that ``_rescale_data`` returns, for 1-D and 2-D targets.
    """
    n_samples = 200
    n_features = 2

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    single_target = n_targets is None
    y = rng.rand(n_samples) if single_target else rng.rand(n_samples, n_targets)

    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)

    expected_X = X * sqrt_sw[:, np.newaxis]
    # A 2-D target needs the weights broadcast along the target axis.
    expected_y = y * sqrt_sw if single_target else y * sqrt_sw[:, np.newaxis]

    assert_array_almost_equal(rescaled_X, expected_X)
    assert_array_almost_equal(rescaled_y, expected_y)
def test_fused_types_make_dataset():
    """Check that ``make_dataset`` keeps float32/float64 precision.

    Datasets are built from the iris data in both precisions, from dense
    arrays and from CSR matrices; the first sample drawn with ``_next_py``
    must have the matching dtype and consistent values across variants.
    """
    iris = load_iris()

    X_32 = iris.data.astype(np.float32)
    y_32 = iris.target.astype(np.float32)
    X_csr_32 = sparse.csr_matrix(X_32)
    sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

    X_64 = iris.data.astype(np.float64)
    y_64 = iris.target.astype(np.float64)
    X_csr_64 = sparse.csr_matrix(X_64)
    sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

    # array
    dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
    dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
    xi_32, yi_32, _, _ = dataset_32._next_py()
    xi_64, yi_64, _, _ = dataset_64._next_py()
    # The first element of the returned sample triple is its data array.
    xi_data_32, _, _ = xi_32
    xi_data_64, _, _ = xi_64

    assert xi_data_32.dtype == np.float32
    assert xi_data_64.dtype == np.float64
    # NOTE(review): `rtol` is not defined in this function; presumably a
    # module-level tolerance -- confirm.
    assert_allclose(yi_64, yi_32, rtol=rtol)

    # csr
    datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
    datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)
    xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py()
    xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py()
    xicsr_data_32, _, _ = xicsr_32
    xicsr_data_64, _, _ = xicsr_64

    assert xicsr_data_32.dtype == np.float32
    assert xicsr_data_64.dtype == np.float64

    assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol)
    assert_allclose(yicsr_64, yicsr_32, rtol=rtol)

    # Dense and CSR datasets must yield exactly the same first sample.
    assert_array_equal(xi_data_32, xicsr_data_32)
    assert_array_equal(xi_data_64, xicsr_data_64)
    assert_array_equal(yi_32, yicsr_32)
    assert_array_equal(yi_64, yicsr_64)

View File

@@ -0,0 +1,318 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
#
# License: BSD 3 clause
from math import log
import numpy as np
import pytest
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_less
from sklearn.utils import check_random_state
from sklearn.linear_model import BayesianRidge, ARDRegression
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.utils.extmath import fast_logdet
# Diabetes dataset, loaded once at import time and reused by the tests below.
diabetes = datasets.load_diabetes()
def test_n_iter():
    """Check that ``BayesianRidge(n_iter=0)`` is rejected at fit time."""
    features = np.array([[1], [2], [6], [8], [10]])
    target = np.array([1, 2, 6, 8, 10])
    expected_message = "n_iter should be greater than or equal to 1."

    model = BayesianRidge(n_iter=0)
    with pytest.raises(ValueError, match=expected_message):
        model.fit(features, target)
def test_bayesian_ridge_scores():
    """Check the shape of ``scores_``: one entry per iteration plus one."""
    model = BayesianRidge(compute_score=True)
    model.fit(diabetes.data, diabetes.target)

    assert model.scores_.shape == (model.n_iter_ + 1,)
def test_bayesian_ridge_score_values():
    """Check value of score on toy example.

    Compute log marginal likelihood with equation (36) in Sparse Bayesian
    Learning and the Relevance Vector Machine (Tipping, 2001):

    - 0.5 * (log |Id/alpha + X.X^T/lambda| +
             y^T.(Id/alpha + X.X^T/lambda)^-1.y + n * log(2 * pi))
    + lambda_1 * log(lambda) - lambda_2 * lambda
    + alpha_1 * log(alpha) - alpha_2 * alpha

    and check equality with the score computed during training.
    """

    X, y = diabetes.data, diabetes.target
    n_samples = X.shape[0]
    # check with initial values of alpha and lambda (see code for the values)
    eps = np.finfo(np.float64).eps
    alpha_ = 1.0 / (np.var(y) + eps)
    lambda_ = 1.0

    # value of the parameters of the Gamma hyperpriors
    alpha_1 = 0.1
    alpha_2 = 0.1
    lambda_1 = 0.1
    lambda_2 = 0.1

    # compute score using formula of docstring
    score = lambda_1 * log(lambda_) - lambda_2 * lambda_
    score += alpha_1 * log(alpha_) - alpha_2 * alpha_
    M = 1.0 / alpha_ * np.eye(n_samples) + 1.0 / lambda_ * np.dot(X, X.T)
    # Solve M z = y rather than forming the inverse of M explicitly.
    M_inv_dot_y = np.linalg.solve(M, y)
    score += -0.5 * (
        fast_logdet(M) + np.dot(y.T, M_inv_dot_y) + n_samples * log(2 * np.pi)
    )

    # compute score with BayesianRidge
    clf = BayesianRidge(
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        n_iter=1,
        fit_intercept=False,
        compute_score=True,
    )
    clf.fit(X, y)

    # Only the first recorded score is compared with the hand-computed value.
    assert_almost_equal(clf.scores_[0], score, decimal=9)
def test_bayesian_ridge_parameter():
    """Check ``lambda_`` and ``alpha_`` of BayesianRidge (GitHub issue #8224).

    A Ridge model whose ``alpha`` is the ratio ``lambda_ / alpha_`` of the
    fitted Bayesian model must have the same coefficients and intercept.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T

    bayes = BayesianRidge(compute_score=True).fit(X, y)
    equivalent_alpha = bayes.lambda_ / bayes.alpha_
    ridge = Ridge(alpha=equivalent_alpha).fit(X, y)

    assert_array_almost_equal(ridge.coef_, bayes.coef_)
    assert_almost_equal(ridge.intercept_, bayes.intercept_)
def test_bayesian_sample_weights():
    """Check ``sample_weight`` support in ``BayesianRidge.fit``.

    With identical sample weights, a Ridge model using
    ``alpha = lambda_ / alpha_`` from the fitted Bayesian model must have
    the same coefficients and intercept.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
    w = np.array([4, 3, 3, 1, 1, 2, 3]).T

    bayes = BayesianRidge(compute_score=True).fit(X, y, sample_weight=w)
    equivalent_alpha = bayes.lambda_ / bayes.alpha_
    ridge = Ridge(alpha=equivalent_alpha).fit(X, y, sample_weight=w)

    assert_array_almost_equal(ridge.coef_, bayes.coef_)
    assert_almost_equal(ridge.intercept_, bayes.intercept_)
def test_toy_bayesian_ridge_object():
    """Check that BayesianRidge approximately learns the identity function."""
    X_train = np.array([[1], [2], [6], [8], [10]])
    y_train = np.array([1, 2, 6, 8, 10])

    model = BayesianRidge(compute_score=True)
    model.fit(X_train, y_train)

    # Predictions must match the inputs to 2 decimals.
    X_query = [[1], [3], [4]]
    assert_array_almost_equal(model.predict(X_query), [1, 3, 4], 2)
def test_bayesian_initial_params():
    """Check BayesianRidge with explicit ``alpha_init`` and ``lambda_init``."""
    # Cubic Vandermonde features on x = 0..4; y = (x^3 - 6x^2 + 8x) / 3
    X = np.vander(np.linspace(0, 4, 5), 4)
    y = np.array([0.0, 1.0, 0.0, -1.0, 0.0])

    # In this case, starting from the default initial values will increase
    # the bias of the fitted curve. So, lambda_init should be small.
    model = BayesianRidge(alpha_init=1.0, lambda_init=1e-3)
    model.fit(X, y)

    # The R2 score must be nearly equal to one.
    assert_almost_equal(model.score(X, y), 1.0)
def test_prediction_bayesian_ridge_ard_with_constant_input():
    """Check predictions of both Bayesian regressors on a constant target.

    When every target equals the same value, predictions on the training
    data must reproduce that value.
    """
    n_samples = 4
    n_features = 5
    prng = check_random_state(42)
    constant_value = prng.rand()
    X = prng.random_sample((n_samples, n_features))

    value_dtype = np.array(constant_value).dtype
    y = np.full(n_samples, constant_value, dtype=value_dtype)
    expected = np.full(n_samples, constant_value, dtype=value_dtype)

    for model in [BayesianRidge(), ARDRegression()]:
        predicted = model.fit(X, y).predict(X)
        assert_array_almost_equal(predicted, expected)
def test_std_bayesian_ridge_ard_with_constant_input():
    """Check predictive std of both Bayesian regressors on a constant target.

    The standard deviation returned with ``return_std=True`` must stay
    below 0.01 for every training sample.
    """
    n_samples = 10
    n_features = 5
    prng = check_random_state(42)
    constant_value = prng.rand()
    X = prng.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value, dtype=np.array(constant_value).dtype)

    std_upper_bound = 0.01

    for model in [BayesianRidge(), ARDRegression()]:
        model.fit(X, y)
        _, y_std = model.predict(X, return_std=True)
        assert_array_less(y_std, std_upper_bound)
def test_update_of_sigma_in_ard():
    """Check that ``sigma_`` is updated after the last ARD iteration.

    See issue #10128.
    """
    X = np.array([[1, 0], [0, 0]])
    y = np.array([0, 0])

    model = ARDRegression(n_iter=1)
    model.fit(X, y)

    # With the inputs above, ARDRegression prunes both of the two coefficients
    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
    assert model.sigma_.shape == (0, 0)

    # Prediction with uncertainty must not raise afterwards.
    model.predict(X, return_std=True)
def test_toy_ard_object():
    """Check that ARDRegression approximately learns the identity function."""
    X_train = np.array([[1], [2], [3]])
    y_train = np.array([1, 2, 3])

    model = ARDRegression(compute_score=True)
    model.fit(X_train, y_train)

    # Predictions must match the inputs to 2 decimals.
    X_query = [[1], [3], [4]]
    assert_array_almost_equal(model.predict(X_query), [1, 3, 4], 2)
@pytest.mark.parametrize("seed", range(100))
@pytest.mark.parametrize("n_samples, n_features", ((10, 100), (100, 10)))
def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features):
    """Check that ARD converges accurately on an easy problem.

    The target is exactly the second feature column, so the fitted
    coefficient for that column must equal 1 within 1e-10.
    (GitHub issue #14055)
    """
    # NOTE(review): `n_samples` and `n_features` are parametrized but not used
    # below; the data shape is fixed at (250, 3). Confirm whether this is
    # intended before wiring them in.
    data = np.random.RandomState(seed=seed).normal(size=(250, 3))
    target = data[:, 1]

    ard = ARDRegression()
    ard.fit(data, target)

    assert np.abs(1 - ard.coef_[1]) < 1e-10
def test_return_std():
    """Check the ``return_std`` option of both Bayesian regressors.

    Data is generated from a fixed linear model plus Gaussian noise of
    standard deviation ``noise_mult``. For each noise level, the predictive
    standard deviation returned by ``BayesianRidge`` and ``ARDRegression``
    must match ``noise_mult`` to ``decimal`` decimals (0, 1, then 2).

    All random draws come from a local, seeded ``RandomState`` so the test
    is reproducible and does not depend on the global NumPy RNG state.
    """
    rng = np.random.RandomState(0)

    def f(X):
        # Noise-free linear model; `w` and `b` are bound below.
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        return f(X) + rng.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = rng.random_sample((n_train, d))
    X_test = rng.random_sample((n_test, d))

    # Smaller noise is checked to more decimals: index 0, 1, 2 of the list.
    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        m1 = BayesianRidge()
        m1.fit(X, y)
        _, y_std1 = m1.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)

        m2 = ARDRegression()
        m2.fit(X, y)
        _, y_std2 = m2.predict(X_test, return_std=True)
        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
@pytest.mark.parametrize("seed", range(10))
def test_update_sigma(seed):
    """Check that the two ``ARDRegression`` sigma-update helpers agree.

    ``_update_sigma`` and ``_update_sigma_woodbury`` must return the same
    matrix. The woodbury formula is used when n_samples < n_features, and
    the other one is used otherwise.
    """
    prng = np.random.RandomState(seed)

    # set n_samples == n_features to avoid instability issues when inverting
    # the matrices. Using the woodbury formula would be unstable when
    # n_samples > n_features
    n_samples = n_features = 10
    X = prng.randn(n_samples, n_features)
    alpha = 1
    lmbda = np.arange(1, n_features + 1)
    keep_lambda = np.array([True] * n_features)

    ard = ARDRegression()
    direct = ard._update_sigma(X, alpha, lmbda, keep_lambda)
    woodbury = ard._update_sigma_woodbury(X, alpha, lmbda, keep_lambda)

    np.testing.assert_allclose(direct, woodbury)
# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
def test_ard_regression_predict_normalize_true():
    """Check prediction with ``normalize=True`` and ``return_std=True``.

    The test only verifies that fitting and predicting do not raise.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18605
    """
    X_train = [[0, 0], [1, 1], [2, 2]]
    y_train = [0, 1, 2]

    model = ARDRegression(normalize=True)
    model.fit(X_train, y_train)
    model.predict([[1, 1]], return_std=True)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
def test_dtype_match(dtype, Estimator):
    """Check that fitted attributes and predictions keep the dtype of ``X``.

    In particular np.float32 input data must not be cast to np.float64.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]], dtype=dtype)
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T

    model = Estimator()
    model.fit(X, y)

    # Fitted attributes follow the input dtype.
    for attribute in ["coef_", "sigma_"]:
        assert getattr(model, attribute).dtype == X.dtype

    # So do the predictive mean and standard deviation.
    y_mean, y_std = model.predict(X, return_std=True)
    assert y_mean.dtype == X.dtype
    assert y_std.dtype == X.dtype
@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
def test_dtype_correctness(Estimator):
    """Check that float32 and float64 fits give close coefficients."""
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5]).T

    # The same estimator instance is refit; `coef_` is read after each fit.
    model = Estimator()
    X_single = X.astype(np.float32)
    coef_32 = model.fit(X_single, y).coef_
    X_double = X.astype(np.float64)
    coef_64 = model.fit(X_double, y).coef_

    np.testing.assert_allclose(coef_32, coef_64, rtol=1e-4)

View File

@@ -0,0 +1,78 @@
# Author: Maria Telenczuk <https://github.com/maikia>
#
# License: BSD 3 clause
import pytest
import sys
import warnings
import numpy as np
from sklearn.base import is_classifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils import check_random_state
@pytest.mark.parametrize(
    "normalize, n_warnings, warning_category",
    [(True, 1, FutureWarning), (False, 1, FutureWarning), ("deprecated", 0, None)],
)
@pytest.mark.parametrize(
    "estimator",
    [
        LinearRegression,
        Ridge,
        RidgeCV,
        RidgeClassifier,
        RidgeClassifierCV,
        BayesianRidge,
        ARDRegression,
    ],
)
# FIXME remove test in 1.2
@pytest.mark.xfail(
    sys.platform == "darwin" and np_version < parse_version("1.22"),
    reason="https://github.com/scikit-learn/scikit-learn/issues/21395",
)
def test_linear_model_normalize_deprecation_message(
    estimator, normalize, n_warnings, warning_category
):
    """Check the ``normalize`` deprecation warning of linear models.

    Passing ``normalize=True`` or ``normalize=False`` must raise exactly
    ``n_warnings`` FutureWarning mentioning the deprecation, and nothing
    else; leaving it at ``"deprecated"`` must raise no FutureWarning.
    """
    prng = check_random_state(0)
    n_samples = 200
    n_features = 2
    X = prng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0
    y = prng.rand(n_samples)
    if is_classifier(estimator):
        y = np.sign(y)

    model = estimator(normalize=normalize)

    if warning_category is None:
        # No warning expected: escalate any FutureWarning to an error.
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            model.fit(X, y)
        return

    with pytest.warns(warning_category) as record:
        model.fit(X, y)

    # Split the record so unrelated warnings can be reported explicitly.
    wanted, unwanted = [], []
    for rec in record:
        if rec.category == warning_category:
            wanted.append(rec)
        else:
            unwanted.append(rec)

    if unwanted:
        msg = "unexpected warnings:\n" + "".join(str(w) + "\n" for w in unwanted)
        raise AssertionError(msg)

    assert "'normalize' was deprecated" in str(wanted[0].message)
    assert len(wanted) == n_warnings

View File

@@ -0,0 +1,211 @@
# Authors: Manoj Kumar mks542@nyu.edu
# License: BSD 3 clause
import numpy as np
from scipy import optimize, sparse
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor, LinearRegression, SGDRegressor, Ridge
from sklearn.linear_model._huber import _huber_loss_and_gradient
def make_regression_with_outliers(n_samples=50, n_features=20):
    """Return a regression dataset in which some rows of ``X`` are pure noise.

    A dataset is generated with ``make_regression`` (fixed seed, noise 0.05),
    then ``int(0.1 * n_samples)`` row indices are drawn with replacement and
    the corresponding rows of ``X`` are overwritten with scaled Gaussian
    noise. ``y`` is left untouched.

    Parameters
    ----------
    n_samples : int, default=50
        Number of samples to generate.
    n_features : int, default=20
        Number of features to generate.

    Returns
    -------
    X, y : the corrupted features and the original targets.
    """
    noise_rng = np.random.RandomState(0)
    X, y = make_regression(
        n_samples=n_samples, n_features=n_features, random_state=0, noise=0.05
    )

    # Replace 10% of the sample with noise.
    n_outliers = int(0.1 * n_samples)
    outlier_rows = noise_rng.randint(0, n_samples, n_outliers)
    X[outlier_rows, :] = 2.0 * noise_rng.normal(0, 1, (n_outliers, X.shape[1]))
    return X, y
def test_huber_equals_lr_for_high_epsilon():
    """HuberRegressor must match LinearRegression for a very large epsilon."""
    X, y = make_regression_with_outliers()

    linear = LinearRegression()
    linear.fit(X, y)

    # No regularization, so only the loss differs between the two models.
    huber = HuberRegressor(epsilon=1e3, alpha=0.0)
    huber.fit(X, y)

    assert_almost_equal(huber.coef_, linear.coef_, 3)
    assert_almost_equal(huber.intercept_, linear.intercept_, 2)
def test_huber_max_iter():
    """With ``max_iter=1`` the reported ``n_iter_`` must equal ``max_iter``."""
    X, y = make_regression_with_outliers()
    model = HuberRegressor(max_iter=1)
    model.fit(X, y)
    assert model.n_iter_ == model.max_iter
def test_huber_gradient():
    """Check the gradient returned by ``_huber_loss_and_gradient``.

    ``scipy.optimize.check_grad`` compares it with a finite-difference
    estimate at several random points.
    """
    prng = np.random.RandomState(1)
    X, y = make_regression_with_outliers()
    sample_weight = prng.randint(1, 3, (y.shape[0]))

    def _loss(x, *args):
        return _huber_loss_and_gradient(x, *args)[0]

    def _grad(x, *args):
        return _huber_loss_and_gradient(x, *args)[1]

    # Check for both fit_intercept and otherwise.
    parameter_sizes = [X.shape[1] + 1, X.shape[1] + 2]

    for _ in range(5):
        for n_params in parameter_sizes:
            w = prng.randn(n_params)
            # The last entry is forced to be non-negative.
            w[-1] = np.abs(w[-1])
            grad_error = optimize.check_grad(
                _loss, _grad, w, X, y, 0.01, 0.1, sample_weight
            )
            assert_almost_equal(grad_error, 1e-6, 4)
def test_huber_sample_weights():
    """Check the ``sample_weight`` implementation of HuberRegressor.

    Unit weights must reproduce the unweighted fit, integer weights must be
    equivalent to repeating samples, and the sparse code path must agree
    with the dense one.
    """
    X, y = make_regression_with_outliers()
    huber = HuberRegressor()
    huber.fit(X, y)
    ref_coef = huber.coef_
    ref_intercept = huber.intercept_

    # Rescale coefs before comparing with assert_array_almost_equal to make
    # sure that the number of decimal places used is somewhat insensitive to
    # the amplitude of the coefficients and therefore to the scale of the
    # data and the regularization parameter
    scale = max(np.mean(np.abs(huber.coef_)), np.mean(np.abs(huber.intercept_)))

    # Unit weights: same solution as no weights at all.
    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
    assert_array_almost_equal(huber.coef_ / scale, ref_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, ref_intercept / scale)

    # Repeat sample 1 twice and sample 3 once ...
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    repeated_rows = np.vstack((X[1], X[1], X[3]))
    X_new = np.vstack((X, repeated_rows))
    y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
    huber.fit(X_new, y_new)
    ref_coef = huber.coef_
    ref_intercept = huber.intercept_

    # ... which must be equivalent to weights 3 and 2 on those samples.
    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    huber.fit(X, y, sample_weight=sample_weight)

    assert_array_almost_equal(huber.coef_ / scale, ref_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, ref_intercept / scale)

    # Test sparse implementation with sample weights.
    huber_sparse = HuberRegressor()
    huber_sparse.fit(sparse.csr_matrix(X), y, sample_weight=sample_weight)
    assert_array_almost_equal(huber_sparse.coef_ / scale, ref_coef / scale)
def test_huber_sparse():
    """Dense and CSR inputs must give the same coefficients and outliers."""
    X, y = make_regression_with_outliers()

    dense_model = HuberRegressor(alpha=0.1)
    dense_model.fit(X, y)

    sparse_model = HuberRegressor(alpha=0.1)
    sparse_model.fit(sparse.csr_matrix(X), y)

    assert_array_almost_equal(sparse_model.coef_, dense_model.coef_)
    assert_array_equal(dense_model.outliers_, sparse_model.outliers_)
def test_huber_scaling_invariant():
    """Check that outlier detection does not depend on the data scale."""
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0)

    huber.fit(X, y)
    reference_mask = huber.outliers_
    # Not every sample may be flagged as an outlier.
    assert not np.all(reference_mask)

    # Scaling y alone must leave the outlier mask unchanged.
    huber.fit(X, 2.0 * y)
    mask_scaled_y = huber.outliers_
    assert_array_equal(mask_scaled_y, reference_mask)

    # Scaling both X and y must leave it unchanged as well.
    huber.fit(2.0 * X, 2.0 * y)
    mask_scaled_both = huber.outliers_
    assert_array_equal(mask_scaled_both, reference_mask)
def test_huber_and_sgd_same_results():
    """HuberRegressor and SGDRegressor (huber loss) must find close coefficients."""
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # Fit once to find out the scale parameter. Scale down X and y by scale
    # so that the scale parameter is optimized to 1.0
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35)
    huber.fit(X, y)
    X_scaled = X / huber.scale_
    y_scaled = y / huber.scale_
    huber.fit(X_scaled, y_scaled)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgd_params = dict(
        alpha=0.0,
        loss="huber",
        shuffle=True,
        random_state=0,
        max_iter=10000,
        fit_intercept=False,
        epsilon=1.35,
        tol=None,
    )
    sgd = SGDRegressor(**sgd_params)
    sgd.fit(X_scaled, y_scaled)

    assert_array_almost_equal(huber.coef_, sgd.coef_, 1)
def test_huber_warm_start():
    """Refitting with ``warm_start=True`` must stop immediately.

    The second fit starts from the first solution, so it must report zero
    iterations and leave the coefficients almost unchanged.
    """
    X, y = make_regression_with_outliers()
    model = HuberRegressor(alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1)

    model.fit(X, y)
    first_fit_coef = model.coef_.copy()

    model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(model.coef_, first_fit_coef, 1)

    assert model.n_iter_ == 0
def test_huber_better_r2_score():
    """Huber must beat Ridge on the inliers and do worse on the outliers."""
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(alpha=0.01)
    huber.fit(X, y)

    # Inliers are the samples whose residual is below epsilon * scale_.
    residual = np.dot(X, huber.coef_) + huber.intercept_ - y
    inliers = np.abs(residual) < huber.epsilon * huber.scale_
    outliers = ~inliers

    huber_inlier_score = huber.score(X[inliers], y[inliers])
    huber_outlier_score = huber.score(X[outliers], y[outliers])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber
    # regressor.
    ridge = Ridge(alpha=0.01)
    ridge.fit(X, y)
    ridge_inlier_score = ridge.score(X[inliers], y[inliers])
    ridge_outlier_score = ridge.score(X[outliers], y[outliers])
    assert huber_inlier_score > ridge_inlier_score

    # The huber model should also fit poorly on the outliers.
    assert ridge_outlier_score > huber_outlier_score
def test_huber_bool():
    """Fitting HuberRegressor on boolean features must not crash."""
    X, y = make_regression(n_samples=200, n_features=2, noise=4.0, random_state=0)
    # Threshold the features at zero to get a boolean design matrix.
    bool_features = X > 0
    model = HuberRegressor()
    model.fit(bool_features, y)

View File

@@ -0,0 +1,980 @@
import warnings
import numpy as np
import pytest
from scipy import linalg
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import TempMemmap
from sklearn.utils import check_random_state
from sklearn.exceptions import ConvergenceWarning
from sklearn import linear_model, datasets
from sklearn.linear_model._least_angle import _lars_path_residues
from sklearn.linear_model import LassoLarsIC, lars_path
from sklearn.linear_model import Lars, LassoLars, LarsCV, LassoLarsCV
# TODO: use another dataset that has multiple drops
# Module-level fixtures shared by the tests below: the diabetes data, the
# Gram matrix ``X.T @ X``, the product ``X.T @ y`` and the number of samples.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target
G = np.dot(X.T, X)
Xy = np.dot(X.T, y)
n_samples = y.size
# FIXME: 'normalize' to be removed in 1.4
# Reusable pytest mark that ignores warnings whose message starts with
# "The default of 'normalize'".
filterwarnings_normalize = pytest.mark.filterwarnings(
"ignore:The default of 'normalize'"
)
# FIXME: 'normalize' to be removed in 1.4
@pytest.mark.parametrize(
    "LeastAngleModel", [Lars, LassoLars, LarsCV, LassoLarsCV, LassoLarsIC]
)
@pytest.mark.parametrize(
    "normalize, n_warnings", [(True, 0), (False, 0), ("deprecated", 1)]
)
def test_assure_warning_when_normalize(LeastAngleModel, normalize, n_warnings):
    """Count the warnings recorded while fitting with a given ``normalize``."""
    # check that we issue a FutureWarning when normalize was set
    n_rows, n_cols = 200, 2
    rng = check_random_state(0)
    features = rng.randn(n_rows, n_cols)
    features[features < 0.1] = 0.0
    target = rng.rand(n_rows)

    estimator = LeastAngleModel(normalize=normalize)
    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter("always", FutureWarning)
        estimator.fit(features, target)

    messages = [entry.message for entry in recorded]
    assert len(messages) == n_warnings
def test_simple():
    """Check that LARS keeps the top covariances tied, with verbose output on.

    At step ``i`` of the path, ``i + 1`` features (capped at the number of
    features) must have an absolute covariance with the residual within
    ``1e-3`` of the maximum.
    """
    import sys
    from io import StringIO

    n_features = X.shape[1]
    tie_tol = 1e-3
    saved_stdout = sys.stdout
    try:
        # Capture whatever ``verbose=10`` prints while the path is computed.
        sys.stdout = StringIO()
        _, _, coef_path_ = linear_model.lars_path(X, y, method="lar", verbose=10)
        sys.stdout = saved_stdout

        for step, coef_ in enumerate(coef_path_.T):
            residual = y - np.dot(X, coef_)
            abs_cov = abs(np.dot(X.T, residual))
            n_tied = len(abs_cov[np.max(abs_cov) - tie_tol < abs_cov])
            # no more than max_pred variables can go into the active set
            assert n_tied == min(step + 1, n_features)
    finally:
        sys.stdout = saved_stdout
def test_simple_precomputed():
    """Same tied-covariance check as ``test_simple``, with a precomputed Gram."""
    n_features = X.shape[1]
    tie_tol = 1e-3
    _, _, coef_path_ = linear_model.lars_path(X, y, Gram=G, method="lar")

    for step, coef_ in enumerate(coef_path_.T):
        residual = y - np.dot(X, coef_)
        abs_cov = abs(np.dot(X.T, residual))
        n_tied = len(abs_cov[np.max(abs_cov) - tie_tol < abs_cov])
        # no more than max_pred variables can go into the active set
        assert n_tied == min(step + 1, n_features)
def _assert_same_lars_path_result(output1, output2):
    """Assert that two result tuples have equal length and close elements."""
    assert len(output1) == len(output2)
    for left, right in zip(output1, output2):
        assert_allclose(left, right)
@pytest.mark.parametrize("method", ["lar", "lasso"])
@pytest.mark.parametrize("return_path", [True, False])
def test_lars_path_gram_equivalent(method, return_path):
    """``lars_path_gram`` and ``lars_path`` with a Gram must return the same."""
    from_gram_only = linear_model.lars_path_gram(
        Xy=Xy, Gram=G, n_samples=n_samples, method=method, return_path=return_path
    )
    from_data_and_gram = linear_model.lars_path(
        X, y, Gram=G, method=method, return_path=return_path
    )
    _assert_same_lars_path_result(from_gram_only, from_data_and_gram)
def test_x_none_gram_none_raises_value_error():
    """``lars_path`` called with neither X nor Gram must raise ValueError."""
    precomputed_Xy = np.dot(X.T, y)
    with pytest.raises(ValueError):
        linear_model.lars_path(None, y, Gram=None, Xy=precomputed_Xy)
def test_all_precomputed():
    """``lars_path`` with precomputed Gram and Xy must match the plain call."""
    gram = np.dot(X.T, X)
    cov = np.dot(X.T, y)
    for method in ("lar", "lasso"):
        plain = linear_model.lars_path(X, y, method=method)
        precomputed = linear_model.lars_path(X, y, Gram=gram, Xy=cov, method=method)
        for expected, got in zip(plain, precomputed):
            assert_array_almost_equal(expected, got)
# FIXME: 'normalize' to be removed in 1.4
@filterwarnings_normalize
@pytest.mark.filterwarnings("ignore: `rcond` parameter will change")
# numpy deprecation
def test_lars_lstsq():
    """LassoLars with ``alpha=0.0`` must reach the least squares solution."""
    scaled_X = 3 * X  # use un-normalized dataset
    estimator = linear_model.LassoLars(alpha=0.0)
    estimator.fit(scaled_X, y)

    lstsq_solution = np.linalg.lstsq(scaled_X, y, rcond=None)
    assert_array_almost_equal(estimator.coef_, lstsq_solution[0])
@pytest.mark.filterwarnings("ignore:`rcond` parameter will change")
# numpy deprecation
def test_lasso_gives_lstsq_solution():
    """The last point of the Lars Lasso path must be the least squares fit.

    The path is computed with ``method="lasso"`` on the module-level ``X`` and
    ``y``; its final column is compared with ``np.linalg.lstsq``.
    """
    _, _, coef_path_ = linear_model.lars_path(X, y, method="lasso")
    # Pass ``rcond`` explicitly, as ``test_lars_lstsq`` does, instead of
    # relying on numpy's deprecated default cutoff.
    coef_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]
    assert_array_almost_equal(coef_lstsq, coef_path_[:, -1])
def test_collinearity():
    """Check that ``lars_path`` copes with collinear and all-zero-target input."""
    # First case: the first two columns are identical.
    collinear_X = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]])
    target = np.array([1.0, 0.0, 0])
    rng = np.random.RandomState(0)

    quiet_lars_path = ignore_warnings(linear_model.lars_path)
    _, _, coef_path_ = quiet_lars_path(collinear_X, target, alpha_min=0.01)
    assert not np.isnan(coef_path_).any()
    residual = np.dot(collinear_X, coef_path_[:, -1]) - target
    assert (residual**2).sum() < 1.0  # just make sure it's bounded

    # Second case: random design with an all-zero target.
    n_samples = 10
    random_X = rng.rand(n_samples, 5)
    zero_target = np.zeros(n_samples)
    path_options = {
        "Gram": "auto",
        "copy_X": False,
        "copy_Gram": False,
        "alpha_min": 0.0,
        "method": "lasso",
        "verbose": 0,
        "max_iter": 500,
    }
    _, _, coef_path_ = linear_model.lars_path(random_X, zero_target, **path_options)
    assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
def test_no_path():
    """``return_path=False`` must return the last point of the full path."""
    full_alphas, _, full_coefs = linear_model.lars_path(X, y, method="lar")
    last_alpha, _, last_coef = linear_model.lars_path(
        X, y, method="lar", return_path=False
    )

    assert_array_almost_equal(last_coef, full_coefs[:, -1])
    assert last_alpha == full_alphas[-1]
def test_no_path_precomputed():
    """``return_path=False`` with a Gram must match the end of the full path."""
    full_alphas, _, full_coefs = linear_model.lars_path(X, y, method="lar", Gram=G)
    last_alpha, _, last_coef = linear_model.lars_path(
        X, y, method="lar", Gram=G, return_path=False
    )

    assert_array_almost_equal(last_coef, full_coefs[:, -1])
    assert last_alpha == full_alphas[-1]
def test_no_path_all_precomputed():
    """``return_path=False`` with Gram and Xy must match the full path's end."""
    scaled_X, target = 3 * diabetes.data, diabetes.target
    gram = np.dot(scaled_X.T, scaled_X)
    cov = np.dot(scaled_X.T, target)

    full_alphas, _, full_coefs = linear_model.lars_path(
        scaled_X, target, method="lasso", Xy=cov, Gram=gram, alpha_min=0.9
    )
    last_alpha, _, last_coef = linear_model.lars_path(
        scaled_X,
        target,
        method="lasso",
        Gram=gram,
        Xy=cov,
        alpha_min=0.9,
        return_path=False,
    )

    assert_array_almost_equal(last_coef, full_coefs[:, -1])
    assert last_alpha == full_alphas[-1]
@filterwarnings_normalize
@pytest.mark.parametrize(
    "classifier", [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC]
)
def test_lars_precompute(classifier):
    """All supported ``precompute`` values must give the same coefficients."""
    gram = np.dot(X.T, X)
    gram_model = classifier(precompute=gram)
    reference_coef = ignore_warnings(gram_model.fit)(X, y).coef_

    for precompute in [True, False, "auto", None]:
        coef = classifier(precompute=precompute).fit(X, y).coef_
        assert_array_almost_equal(reference_coef, coef, decimal=8)
def test_singular_matrix():
    """Run ``lars_path`` on a singular 2x2 design and check the path."""
    singular_X = np.array([[1, 1.0], [1.0, 1.0]])
    target = np.array([1, 1])
    _, _, path = linear_model.lars_path(singular_X, target)
    assert_array_almost_equal(path.T, [[0, 0], [1, 0]])
@filterwarnings_normalize
def test_rank_deficient_design():
    """Compare LassoLars and coordinate-descent Lasso on rank deficient data.

    consistency test that checks that LARS Lasso is handling rank
    deficient input data (with n_features < rank) in the same way
    as coordinate descent Lasso
    """
    y = [5, 0, 5]

    def lasso_objective(design, coef):
        # Lasso objective for 3 samples and alpha=0.1.
        squared_error = linalg.norm(y - np.dot(design, coef)) ** 2
        return 1.0 / (2.0 * 3.0) * squared_error + 0.1 * linalg.norm(coef, 1)

    designs = (
        [[5, 0], [0, 5], [10, 10]],
        [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]],
    )
    for design in designs:
        # To be able to use the coefs to compute the objective function,
        # we need to turn off normalization
        lars_coef = linear_model.LassoLars(0.1, normalize=False).fit(design, y).coef_
        lars_objective = lasso_objective(design, lars_coef)

        cd_coef = linear_model.Lasso(0.1, tol=1e-6).fit(design, y).coef_
        cd_objective = lasso_objective(design, cd_coef)

        assert lars_objective < cd_objective * (1.0 + 1e-8)
@filterwarnings_normalize
def test_lasso_lars_vs_lasso_cd():
    """Check that LassoLars and coordinate-descent Lasso agree.

    The comparison is done along the path of ``lars_path`` and through the
    estimator classes, on scaled data and on rescaled data.
    """

    def check_path_against_cd(design):
        # Refit a coordinate-descent Lasso at every non-zero alpha of the
        # LARS lasso path and compare the coefficients.
        path_alphas, _, path_coefs = linear_model.lars_path(design, y, method="lasso")
        cd_model = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        for path_coef, path_alpha in zip(path_coefs.T, path_alphas):
            if path_alpha == 0:
                continue
            cd_model.alpha = path_alpha
            cd_model.fit(design, y)
            assert linalg.norm(path_coef - cd_model.coef_) < 0.01

    X = 3 * diabetes.data
    check_path_against_cd(X)

    # similar test, with the classifiers
    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
        lars_model = linear_model.LassoLars(alpha=alpha, normalize=False).fit(X, y)
        cd_model = linear_model.Lasso(alpha=alpha, tol=1e-8).fit(X, y)
        assert linalg.norm(lars_model.coef_ - cd_model.coef_) < 1e-3

    # same test, with normalized data
    X = diabetes.data
    X = X - X.sum(axis=0)
    X /= np.linalg.norm(X, axis=0)
    check_path_against_cd(X)
@filterwarnings_normalize
def test_lasso_lars_vs_lasso_cd_early_stopping():
    """Check LassoLars against coordinate-descent Lasso with early stopping.

    The path is stopped at several ``alpha_min`` values
    (test : before, in the middle, and in the last part of the path).
    """
    alphas_min = [10, 0.9, 1e-4]

    def check_last_point_against_cd(design, lasso_kwargs):
        # Stop the LARS path at each alpha_min and compare its last point
        # with a coordinate-descent Lasso fitted at the same alpha.
        for alpha_min in alphas_min:
            path_alphas, _, path_coefs = linear_model.lars_path(
                design, y, method="lasso", alpha_min=alpha_min
            )
            cd_model = linear_model.Lasso(**lasso_kwargs)
            cd_model.alpha = path_alphas[-1]
            cd_model.fit(design, y)
            assert linalg.norm(path_coefs[:, -1] - cd_model.coef_) < 0.01

    check_last_point_against_cd(diabetes.data, {"fit_intercept": False, "tol": 1e-8})

    # same test, with normalization
    rescaled = diabetes.data - diabetes.data.sum(axis=0)
    rescaled /= np.linalg.norm(rescaled, axis=0)
    check_last_point_against_cd(rescaled, {"tol": 1e-8})
@filterwarnings_normalize
def test_lasso_lars_path_length():
    """Test that the path length of the LassoLars is right."""
    full_model = linear_model.LassoLars()
    full_model.fit(X, y)
    full_alphas = full_model.alphas_

    # Refit with alpha set to the third alpha of the full path.
    truncated_model = linear_model.LassoLars(alpha=full_alphas[2])
    truncated_model.fit(X, y)
    assert_array_almost_equal(full_model.alphas_[:3], truncated_model.alphas_)

    # Also check that the sequence of alphas is always decreasing
    assert np.all(np.diff(full_model.alphas_) < 0)
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    """Compare ``lars_path`` and ``lasso_path`` on an ill-conditioned design.

    Test lasso lars on a very ill-conditioned design, and check that
    it does not blow up, and stays somewhat close to a solution given
    by the coordinate descent solver.
    Also test that lasso_path (using lars_path output style) gives
    the same result as lars_path and previous lasso output style
    under these conditions.
    """
    rng = np.random.RandomState(42)

    # Generate data
    n_rows, n_cols = 70, 100
    n_active = 5
    design = rng.randn(n_rows, n_cols)

    # Sparse ground-truth weights with ``n_active`` non-zero entries.
    true_coef = np.zeros((n_cols, 1))
    shuffled_idx = np.arange(0, n_cols)
    rng.shuffle(shuffled_idx)
    support = shuffled_idx[:n_active]
    signs = np.sign(rng.randn(n_active, 1))
    magnitudes = rng.rand(n_active, 1) + 1
    true_coef[support] = signs * magnitudes

    target = np.dot(design, true_coef)
    sigma = 0.2
    target += sigma * rng.rand(*target.shape)
    target = target.squeeze()

    lars_alphas, _, lars_coef = linear_model.lars_path(design, target, method="lasso")
    _, cd_coef, _ = linear_model.lasso_path(
        design, target, alphas=lars_alphas, tol=1e-6
    )
    assert_array_almost_equal(lars_coef, cd_coef, decimal=1)
@filterwarnings_normalize
def test_lasso_lars_vs_lasso_cd_ill_conditioned2():
    """Compare LassoLars and Lasso objectives on an ill-conditioned problem.

    Create an ill-conditioned situation in which the LARS has to go
    far in the path to converge, and check that LARS and coordinate
    descent give the same answers.
    Note it used to be the case that Lars had to use the drop for good
    strategy for this but this is no longer the case with the
    equality_tolerance checks
    """
    X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]]
    y = [10, 10, 1]
    alpha = 0.0001

    def objective_function(coef):
        squared_error = linalg.norm(y - np.dot(X, coef)) ** 2
        return 1.0 / (2.0 * len(X)) * squared_error + alpha * linalg.norm(coef, 1)

    lars_model = linear_model.LassoLars(alpha=alpha, normalize=False)
    expected_warning = "Regressors in active set degenerate."
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        lars_model.fit(X, y)
    lars_objective = objective_function(lars_model.coef_)

    cd_model = linear_model.Lasso(alpha=alpha, tol=1e-4)
    cd_objective = objective_function(cd_model.fit(X, y).coef_)

    assert lars_objective < cd_objective * (1.0 + 1e-8)
@filterwarnings_normalize
def test_lars_add_features():
    """Fit Lars on a Hilbert matrix and check the coefficients are finite."""
    # assure that at least some features get added if necessary
    # test for 6d2b4c
    size = 5
    column_terms = np.arange(1, size + 1)
    row_terms = np.arange(size)[:, np.newaxis]
    # Hilbert matrix
    hilbert = 1.0 / (column_terms + row_terms)

    model = linear_model.Lars(fit_intercept=False).fit(hilbert, np.arange(size))
    assert np.all(np.isfinite(model.coef_))
@filterwarnings_normalize
def test_lars_n_nonzero_coefs(verbose=False):
    """Check the coefficient count and path length for ``n_nonzero_coefs=6``."""
    model = linear_model.Lars(n_nonzero_coefs=6, verbose=verbose)
    model.fit(X, y)

    nonzero_positions = model.coef_.nonzero()[0]
    assert len(nonzero_positions) == 6
    # The path should be of length 6 + 1 in a Lars going down to 6
    # non-zero coefs
    assert len(model.alphas_) == 7
@filterwarnings_normalize
@ignore_warnings
def test_multitarget():
    """Assure that estimators receiving multidimensional y do the right thing.

    Each estimator is fitted once on a two-column target and then on every
    column separately; the per-target attributes and predictions must agree.
    """
    Y = np.vstack([y, y**2]).T
    n_targets = Y.shape[1]
    compared_attributes = ("alphas_", "active_", "coef_", "coef_path_")
    estimators = [
        linear_model.LassoLars(),
        linear_model.Lars(),
        # regression test for gh-1615
        linear_model.LassoLars(fit_intercept=False),
        linear_model.Lars(fit_intercept=False),
    ]

    for estimator in estimators:
        estimator.fit(X, Y)
        Y_pred = estimator.predict(X)
        # Grab the multi-target results before the estimator is refitted.
        joint = {name: getattr(estimator, name) for name in compared_attributes}

        for k in range(n_targets):
            estimator.fit(X, Y[:, k])
            y_pred = estimator.predict(X)
            for name in compared_attributes:
                assert_array_almost_equal(joint[name][k], getattr(estimator, name))
            assert_array_almost_equal(Y_pred[:, k], y_pred)
@filterwarnings_normalize
def test_lars_cv():
    """Check how the alpha chosen by LassoLarsCV moves with the sample count.

    Test the LassoLarsCV object by checking that the optimal alpha
    increases as the number of samples increases.
    This property is not actually guaranteed in general and is just a
    property of the given dataset, with the given steps chosen.
    """
    model = linear_model.LassoLarsCV()
    previous_alpha = 0
    for n_rows in (400, 200, 100):
        model.fit(diabetes.data[:n_rows], diabetes.target[:n_rows])
        np.testing.assert_array_less(previous_alpha, model.alpha_)
        previous_alpha = model.alpha_
    assert not hasattr(model, "n_nonzero_coefs")
@filterwarnings_normalize
def test_lars_cv_max_iter(recwarn):
    """Fit LassoLarsCV with ``max_iter=5`` and inspect the recorded warnings.

    The diabetes data is extended with two identical random columns and the
    fit runs with numpy divide/invalid errors raised as exceptions. Exactly
    one warning, the 'normalize' one, is expected.

    Parameters
    ----------
    recwarn : pytest fixture
        Records the warnings emitted during the test.
    """
    warnings.simplefilter("always")
    with np.errstate(divide="raise", invalid="raise"):
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[diabetes.data, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)
    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    # FIXME: when 'normalize' is removed set exchange below for:
    # assert recorded_warnings == []
    assert len(recorded_warnings) == 1
    assert "normalize' will be set to False in version 1.2" in recorded_warnings[0]
@filterwarnings_normalize
def test_lasso_lars_ic():
    """Test the LassoLarsIC object.

    Checks that
    - some good features are selected.
    - alpha_bic > alpha_aic
    - n_nonzero_bic < n_nonzero_aic
    and that an unknown criterion raises ``ValueError`` at fit time.
    """
    bic_model = linear_model.LassoLarsIC("bic")
    aic_model = linear_model.LassoLarsIC("aic")
    rng = np.random.RandomState(42)
    n_real_features = diabetes.data.shape[1]
    noisy_X = diabetes.data
    noisy_X = np.c_[noisy_X, rng.randn(noisy_X.shape[0], 5)]  # add 5 bad features

    bic_model.fit(noisy_X, y)
    aic_model.fit(noisy_X, y)
    bic_selected = np.where(bic_model.coef_)[0]
    aic_selected = np.where(aic_model.coef_)[0]

    assert bic_model.alpha_ > aic_model.alpha_
    assert len(bic_selected) < len(aic_selected)
    assert np.max(bic_selected) < n_real_features

    # test error on unknown IC
    broken_model = linear_model.LassoLarsIC("<unknown>")
    with pytest.raises(ValueError):
        broken_model.fit(noisy_X, y)
def test_lars_path_readonly_data():
    """Run ``_lars_path_residues`` with ``copy=False`` on memory-mapped folds.

    When using automated memory mapping on large input, the
    fold data is in read-only mode
    This is a non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/4597
    """
    folds = train_test_split(X, y, random_state=42)
    with TempMemmap(folds) as mapped_folds:
        X_train, X_test, y_train, y_test = mapped_folds
        # The following should not fail despite copy=False
        _lars_path_residues(X_train, y_train, X_test, y_test, copy=False)
def test_lars_path_positive_constraint():
    """Main test for the ``positive`` parameter of ``lars_path``.

    this is the main test for the positive parameter on the lars_path method
    the estimator classes just make use of this function

    we do the test on the diabetes dataset

    ensure that we get negative coefficients when positive=False
    and all positive when positive=True
    for method 'lar' (default) and lasso
    """
    expected_error = "Positive constraint not supported for 'lar' coding method."
    with pytest.raises(ValueError, match=expected_error):
        linear_model.lars_path(
            diabetes["data"], diabetes["target"], method="lar", positive=True
        )

    method = "lasso"
    _, _, unconstrained = linear_model.lars_path(
        X, y, return_path=True, method=method, positive=False
    )
    assert unconstrained.min() < 0

    _, _, constrained = linear_model.lars_path(
        X, y, return_path=True, method=method, positive=True
    )
    assert constrained.min() >= 0
# Now test the positive option for all estimator classes.
# Base keyword arguments and per-estimator overrides, keyed by estimator
# class name.
# NOTE(review): test_estimatorclasses_positive_constraint defines locals with
# the same names and values, so these bindings look redundant -- confirm
# before removing.
default_parameter = {"fit_intercept": False}
estimator_parameter_map = {
"LassoLars": {"alpha": 0.1},
"LassoLarsCV": {},
"LassoLarsIC": {},
}
@filterwarnings_normalize
def test_estimatorclasses_positive_constraint():
    """Check that the ``positive`` option is honoured by the estimator classes.

    testing the transmissibility for the positive option of all estimator
    classes in this same function here
    """
    default_parameter = {"fit_intercept": False}
    estimator_parameter_map = {
        "LassoLars": {"alpha": 0.1},
        "LassoLarsCV": {},
        "LassoLarsIC": {},
    }

    for estname, overrides in estimator_parameter_map.items():
        params = {**default_parameter, **overrides}
        estimator_class = getattr(linear_model, estname)

        unconstrained = estimator_class(positive=False, **params)
        unconstrained.fit(X, y)
        assert unconstrained.coef_.min() < 0

        constrained = estimator_class(positive=True, **params)
        constrained.fit(X, y)
        assert min(constrained.coef_) >= 0
@filterwarnings_normalize
def test_lasso_lars_vs_lasso_cd_positive():
    """Check LassoLars against coordinate-descent Lasso with ``positive=True``.

    This test is basically a copy of the above with additional positive
    option. However for the middle part, the comparison of coefficient values
    for a range of alphas, we had to make an adaptations. See below.
    """

    def make_positive_cd():
        return linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)

    # not normalized data
    X = 3 * diabetes.data
    path_alphas, _, path_coefs = linear_model.lars_path(
        X, y, method="lasso", positive=True
    )
    cd_model = make_positive_cd()
    for path_coef, path_alpha in zip(path_coefs.T, path_alphas):
        if path_alpha == 0:
            continue
        cd_model.alpha = path_alpha
        cd_model.fit(X, y)
        assert linalg.norm(path_coef - cd_model.coef_) < 0.01

    # The range of alphas chosen for coefficient comparison here is restricted
    # as compared with the above test without the positive option. This is due
    # to the circumstance that the Lars-Lasso algorithm does not converge to
    # the least-squares-solution for small alphas, see 'Least Angle Regression'
    # by Efron et al 2004. The coefficients are typically in congruence up to
    # the smallest alpha reached by the Lars-Lasso algorithm and start to
    # diverge thereafter. See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff
    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        lars_model = linear_model.LassoLars(
            fit_intercept=False, alpha=alpha, normalize=False, positive=True
        ).fit(X, y)
        cd_estimator = linear_model.Lasso(
            fit_intercept=False, alpha=alpha, tol=1e-8, positive=True
        ).fit(X, y)
        assert linalg.norm(lars_model.coef_ - cd_estimator.coef_) < 1e-3

    # normalized data
    X = diabetes.data - diabetes.data.sum(axis=0)
    X /= np.linalg.norm(X, axis=0)
    path_alphas, _, path_coefs = linear_model.lars_path(
        X, y, method="lasso", positive=True
    )
    cd_model = make_positive_cd()
    # don't include alpha=0
    for path_coef, path_alpha in zip(path_coefs.T[:-1], path_alphas[:-1]):
        cd_model.alpha = path_alpha
        cd_model.fit(X, y)
        assert linalg.norm(path_coef - cd_model.coef_) < 0.01
@filterwarnings_normalize
def test_lasso_lars_vs_R_implementation():
    """Compare LassoLars coefficient paths with reference values from R.

    Test that sklearn LassoLars implementation agrees with the LassoLars
    implementation available in R (lars library) under the following
    scenarios:
    1) fit_intercept=False and normalize=False
    2) fit_intercept=True and normalize=True
    """
    # Let's generate the data used in the bug report 7778
    y = np.array([-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366])
    x = np.array(
        [
            [0.47299829, 0, 0, 0, 0],
            [0.08239882, 0.85784863, 0, 0, 0],
            [0.30114139, -0.07501577, 0.80895216, 0, 0],
            [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0],
            [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291],
        ]
    )
    X = x.T

    ###########################################################################
    # Scenario 1: Let's compare R vs sklearn when fit_intercept=False and
    # normalize=False
    ###########################################################################
    #
    # The R result was obtained using the following code:
    #
    # library(lars)
    # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE,
    #                         trace=TRUE, normalize=FALSE)
    # r = t(model_lasso_lars$beta)
    #
    r_row_0 = [
        0, 0, 0, 0, 0,
        -79.810362809499026, -83.528788732782829, -83.777653739190711,
        -83.784156932888934, -84.033390591756657,
    ]
    r_row_1 = [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936]
    r_row_2 = [
        0, -3.577397088285891, -4.702795355871871, -7.016748621359461,
        -7.614898471899412, -0.336938391359179, 0, 0,
        0.001213370600853, 0.048162321585148,
    ]
    r_row_3 = [
        0, 0, 0,
        2.231558436628169, 2.723267514525966, 2.811549786389614,
        2.813766976061531, 2.817462468949557, 2.817368178703816,
        2.816221090636795,
    ]
    r_row_4 = [
        0, 0, -1.218422599914637, -3.457726183014808, -4.021304522060710,
        -45.827461592423745, -47.776608869312305, -47.911561610746404,
        -47.914845922736234, -48.039562334265717,
    ]
    r = np.array([r_row_0, r_row_1, r_row_2, r_row_3, r_row_4])

    model_lasso_lars = linear_model.LassoLars(
        alpha=0, fit_intercept=False, normalize=False
    )
    model_lasso_lars.fit(X, y)
    skl_betas = model_lasso_lars.coef_path_

    assert_array_almost_equal(r, skl_betas, decimal=12)
    ###########################################################################

    ###########################################################################
    # Scenario 2: Let's compare R vs sklearn when fit_intercept=True and
    # normalize=True
    #
    # Note: When normalize is equal to True, R returns the coefficients in
    # their original units, that is, they are rescaled back, whereas sklearn
    # does not do that, therefore, we need to do this step before comparing
    # their results.
    ###########################################################################
    #
    # The R result was obtained using the following code:
    #
    # library(lars)
    # model_lasso_lars2 = lars(X, t(y), type="lasso", intercept=TRUE,
    #                           trace=TRUE, normalize=TRUE)
    # r2 = t(model_lasso_lars2$beta)
    r2 = np.array(
        [
            [0, 0, 0, 0, 0],
            [0, 0, 0, 8.371887668009453, 19.463768371044026],
            [0, 0, 0, 0, 9.901611055290553],
            [
                0, 7.495923132833733, 9.245133544334507,
                17.389369207545062, 26.971656815643499,
            ],
            [0, 0, -1.569380717440311, -5.924804108067312, -7.996385265061972],
        ]
    )

    model_lasso_lars2 = linear_model.LassoLars(alpha=0, normalize=True)
    model_lasso_lars2.fit(X, y)
    skl_betas2 = model_lasso_lars2.coef_path_

    # Let's rescale back the coefficients returned by sklearn before comparing
    # against the R result (read the note above)
    centered = X - np.mean(X, axis=0)
    column_norms = np.sqrt(np.sum(centered**2, axis=0))
    skl_betas2 /= column_norms[:, np.newaxis]

    assert_array_almost_equal(r2, skl_betas2, decimal=12)
    ###########################################################################
@filterwarnings_normalize
@pytest.mark.parametrize("copy_X", [True, False])
def test_lasso_lars_copyX_behaviour(copy_X):
    """
    Test that user input regarding copy_X is not being overridden (it was until
    at least version 0.21)
    """
    rng = np.random.RandomState(0)
    data = rng.normal(0, 1, (100, 5))
    pristine = data.copy()
    target = data[:, 2]

    estimator = LassoLarsIC(copy_X=copy_X, precompute=False)
    estimator.fit(data, target)

    # The input stays untouched exactly when a copy was requested.
    data_unchanged = np.array_equal(data, pristine)
    assert copy_X == data_unchanged
@filterwarnings_normalize
@pytest.mark.parametrize("copy_X", [True, False])
def test_lasso_lars_fit_copyX_behaviour(copy_X):
    """
    Test that user input to .fit for copy_X overrides default __init__ value
    """
    rng = np.random.RandomState(0)
    data = rng.normal(0, 1, (100, 5))
    pristine = data.copy()
    target = data[:, 2]

    estimator = LassoLarsIC(precompute=False)
    estimator.fit(data, target, copy_X=copy_X)

    # The input stays untouched exactly when a copy was requested.
    data_unchanged = np.array_equal(data, pristine)
    assert copy_X == data_unchanged
@filterwarnings_normalize
@pytest.mark.parametrize("est", (LassoLars(alpha=1e-3), Lars()))
def test_lars_with_jitter(est):
    """Compare fits with and without jitter on a small degenerate problem.

    Test that a small amount of jitter helps stability,
    using example provided in issue #2746
    """
    design = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]])
    target = [-2.5, -2.5]
    expected_coef = [0, 2.5, 0, 2.5, 0]

    # set to fit_intercept to False since target is constant and we want check
    # the value of coef. coef would be all zeros otherwise.
    est.set_params(fit_intercept=False)
    jittered = clone(est).set_params(jitter=10e-8, random_state=0)

    est.fit(design, target)
    jittered.fit(design, target)

    mean_squared_gap = np.mean((est.coef_ - jittered.coef_) ** 2)
    assert mean_squared_gap > 0.1
    np.testing.assert_allclose(jittered.coef_, expected_coef, rtol=1e-3)
def test_X_none_gram_not_none():
    """``lars_path`` must raise when X is None but a Gram value is given."""
    expected_error = "X cannot be None if Gram is not None"
    with pytest.raises(ValueError, match=expected_error):
        lars_path(X=None, y=[1], Gram="not None")
def test_copy_X_with_auto_gram():
    """Check that ``lars_path`` leaves X untouched with ``copy_X=True``.

    Non-regression test for #17789, `copy_X=True` and Gram='auto' does not
    overwrite X
    """
    rng = np.random.RandomState(42)
    data = rng.rand(6, 6)
    target = rng.rand(6)
    snapshot = data.copy()

    linear_model.lars_path(data, target, Gram="auto", copy_X=True, method="lasso")

    # X did not change
    assert_allclose(data, snapshot)
@pytest.mark.parametrize(
    "LARS, has_coef_path, args",
    (
        (Lars, True, {}),
        (LassoLars, True, {}),
        (LassoLarsIC, False, {}),
        (LarsCV, True, {}),
        # max_iter=5 is for avoiding ConvergenceWarning
        (LassoLarsCV, True, {"max_iter": 5}),
    ),
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@filterwarnings_normalize
def test_lars_dtype_match(LARS, has_coef_path, args, dtype):
    """The test ensures that the fit method preserves input dtype."""
    rng = np.random.RandomState(0)
    features = rng.rand(20, 6).astype(dtype)
    target = rng.rand(20).astype(dtype)

    fitted = LARS(**args)
    fitted.fit(features, target)

    assert fitted.coef_.dtype == dtype
    if has_coef_path:
        assert fitted.coef_path_.dtype == dtype
    assert fitted.intercept_.dtype == dtype
@pytest.mark.parametrize(
    "LARS, has_coef_path, args",
    (
        (Lars, True, {}),
        (LassoLars, True, {}),
        (LassoLarsIC, False, {}),
        (LarsCV, True, {}),
        # max_iter=5 is for avoiding ConvergenceWarning
        (LassoLarsCV, True, {"max_iter": 5}),
    ),
)
@filterwarnings_normalize
def test_lars_numeric_consistency(LARS, has_coef_path, args):
    """Check that float32 and float64 fits give close results.

    The test ensures numerical consistency between trained coefficients
    of float32 and float64.
    """
    tolerances = {"rtol": 1e-5, "atol": 1e-5}

    rng = np.random.RandomState(0)
    X_64 = rng.rand(10, 6)
    y_64 = rng.rand(10)

    model_64 = LARS(**args).fit(X_64, y_64)
    model_32 = LARS(**args).fit(X_64.astype(np.float32), y_64.astype(np.float32))

    assert_allclose(model_64.coef_, model_32.coef_, **tolerances)
    if has_coef_path:
        assert_allclose(model_64.coef_path_, model_32.coef_path_, **tolerances)
    assert_allclose(model_64.intercept_, model_32.intercept_, **tolerances)
@pytest.mark.parametrize("criterion", ["aic", "bic"])
def test_lassolarsic_alpha_selection(criterion):
    """Check that we properly compute the AIC and BIC score.

    In this test, we reproduce the example of the Fig. 2 of Zou et al.
    (reference [1] in LassoLarsIC) In this example, only 7 features should be
    selected.
    """
    scaler = StandardScaler()
    selector = LassoLarsIC(criterion=criterion, normalize=False)
    pipeline = make_pipeline(scaler, selector)
    pipeline.fit(X, y)

    # Position of the smallest criterion value on the final pipeline step.
    fitted_selector = pipeline[-1]
    best_alpha_selected = np.argmin(fitted_selector.criterion_)
    assert best_alpha_selected == 7
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_lassolarsic_noise_variance(fit_intercept):
    """Check the behaviour when `n_samples` < `n_features` and that one needs
    to provide the noise variance."""
    rng = np.random.RandomState(0)
    features, target = datasets.make_regression(
        n_samples=10, n_features=11 - fit_intercept, random_state=rng
    )
    pipeline = make_pipeline(
        StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept, normalize=False)
    )

    expected_error = (
        "You are using LassoLarsIC in the case where the number of samples is smaller"
        " than the number of features"
    )
    with pytest.raises(ValueError, match=expected_error):
        pipeline.fit(features, target)

    # With an explicit noise variance the same fit goes through.
    pipeline.set_params(lassolarsic__noise_variance=1.0)
    pipeline.fit(features, target).predict(features)

View File

@@ -0,0 +1,308 @@
"""
Tests for LinearModelLoss
Note that correctness of losses (which compose LinearModelLoss) is already well
covered in the _loss module.
"""
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import linalg, optimize, sparse
from sklearn._loss.loss import (
HalfBinomialLoss,
HalfMultinomialLoss,
HalfPoissonLoss,
)
from sklearn.datasets import make_low_rank_matrix
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.utils.extmath import squared_norm
# We do not need to test all losses, just what LinearModelLoss does on top of the
# base losses.
# Loss classes (not instances) used to parametrize the tests below; each test
# instantiates them itself with ``base_loss()``.
LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]
def random_X_y_coef(
    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
):
    """Random generate y, X and coef in valid range.

    ``coef`` is drawn uniformly between ``coef_bound[0]`` and
    ``coef_bound[1]`` and holds one extra trailing entry when
    ``linear_model_loss.fit_intercept`` is true. ``y`` is derived from the
    raw predictions through the inverse link of the base loss.

    Returns
    -------
    X, y, coef
    """
    rng = np.random.RandomState(seed)
    base_loss = linear_model_loss.base_loss
    with_intercept = linear_model_loss.fit_intercept
    coef_low = coef_bound[0]
    coef_high = coef_bound[1]
    n_dof = n_features + with_intercept

    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )

    if not base_loss.is_multiclass:
        coef = np.empty((n_dof,))
        coef.flat[:] = rng.uniform(low=coef_low, high=coef_high, size=n_dof)
        if with_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        # Perturb the raw predictions before mapping them through the
        # inverse link.
        y = base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )
        return X, y, coef

    n_classes = base_loss.n_classes
    coef = np.empty((n_classes, n_dof))
    coef.flat[:] = rng.uniform(low=coef_low, high=coef_high, size=n_classes * n_dof)
    if with_intercept:
        raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
    else:
        raw_prediction = X @ coef.T
    proba = base_loss.link.inverse(raw_prediction)

    # y = rng.choice(np.arange(n_classes), p=proba) does not work.
    # See https://stackoverflow.com/a/34190035/16761084
    # Vectorized draw: count, per row, how many cumulative probabilities lie
    # strictly below a uniform random number.
    cumulative_proba = proba.cumsum(axis=1)
    uniform_draws = rng.rand(proba.shape[0])[:, None]
    class_index = (cumulative_proba < uniform_draws).sum(axis=1)
    y = np.arange(n_classes)[class_index].astype(np.float64)
    return X, y, coef
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_loss_gradients_are_the_same(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test that loss and gradient are the same across different functions."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
    )

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
    shared_kwargs = {
        "sample_weight": sample_weight,
        "l2_reg_strength": l2_reg_strength,
    }

    # Dense X: the four entry points must agree with each other.
    l1 = loss.loss(coef, X, y, **shared_kwargs)
    g1 = loss.gradient(coef, X, y, **shared_kwargs)
    l2, g2 = loss.loss_gradient(coef, X, y, **shared_kwargs)
    g3, h3 = loss.gradient_hessian_product(coef, X, y, **shared_kwargs)

    assert_allclose(l1, l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)

    # same for sparse X
    X = sparse.csr_matrix(X)
    l1_sp = loss.loss(coef, X, y, **shared_kwargs)
    g1_sp = loss.gradient(coef, X, y, **shared_kwargs)
    l2_sp, g2_sp = loss.loss_gradient(coef, X, y, **shared_kwargs)
    g3_sp, h3_sp = loss.gradient_hessian_product(coef, X, y, **shared_kwargs)

    assert_allclose(l1, l1_sp)
    assert_allclose(l1, l2_sp)
    assert_allclose(g1, g1_sp)
    assert_allclose(g1, g2_sp)
    assert_allclose(g1, g3_sp)
    assert_allclose(h3(g1), h3_sp(g1_sp))
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("X_sparse", [False, True])
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_sparse
):
    """Check that the intercept is handled consistently.

    A model without intercept fitted on ``X`` whose last column is all ones is
    compared with a model with intercept fitted on ``X`` without that column.
    The only expected difference is the L2 penalty on the last coefficient,
    which the intercept model does not apply.
    """
    loss_no_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss_no_inter,
        n_samples=n_samples,
        n_features=n_features,
        seed=42,
    )
    # Turn the last column into a constant to mimic an intercept term ...
    X[:, -1] = 1
    # ... and drop it for the intercept model, which adds it automatically.
    X_inter = X[:, :-1]
    if X_sparse:
        X = sparse.csr_matrix(X)
    if sample_weight == "range":
        n_obs = y.shape[0]
        sample_weight = np.linspace(1, n_obs, num=n_obs)

    common = {"sample_weight": sample_weight, "l2_reg_strength": l2_reg_strength}
    loss_value, grad = loss_no_inter.loss_gradient(coef, X, y, **common)
    _, hessp = loss_no_inter.gradient_hessian_product(coef, X, y, **common)
    loss_value_inter, grad_inter = loss_inter.loss_gradient(
        coef, X_inter, y, **common
    )
    _, hessp_inter = loss_inter.gradient_hessian_product(coef, X_inter, y, **common)

    # Note, that intercept gets no L2 penalty.
    intercept_penalty = 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    assert loss_value == pytest.approx(loss_value_inter + intercept_penalty)

    # Add the missing penalty gradient in place before comparing.
    grad_inter.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(grad, grad_inter)

    direction = np.random.RandomState(42).randn(*coef.shape)
    hess_prod = hessp(direction)
    hess_prod_inter = hessp_inter(direction)
    # Same correction for the Hessian-vector product.
    hess_prod_inter.T[-1] += l2_reg_strength * direction.T[-1]
    assert_allclose(hess_prod, hess_prod_inter)
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_gradients_hessians_numerically(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Compare analytic gradient and Hessian product with numerical derivatives.

    The gradient is checked against a finite-difference derivative of the loss,
    and one Hessian column is checked against the slope of the gradient along
    the corresponding coordinate direction.
    """
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    coef = coef.ravel(order="F")  # this is important only for multinomial loss
    if sample_weight == "range":
        n_obs = y.shape[0]
        sample_weight = np.linspace(1, n_obs, num=n_obs)
    common = {"sample_weight": sample_weight, "l2_reg_strength": l2_reg_strength}

    # 1. Check gradients numerically
    grad_step = 1e-6
    g, hessp = loss.gradient_hessian_product(coef, X, y, **common)

    def central_difference(step):
        # approx_fprime computes a forward difference with increment 2 * step;
        # evaluating the loss at (coef - step) turns it into the central
        # difference (f(x + step) - f(x - step)) / (2 * step).
        return optimize.approx_fprime(
            coef,
            lambda shifted: loss.loss(shifted - step, X, y, **common),
            2 * step,
        )

    # Combine two central differences to get a finite difference of accuracy 4
    # (five-point stencil), see
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    # https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative
    approx_g1 = central_difference(grad_step)
    approx_g2 = central_difference(2 * grad_step)
    approx_g = (4 * approx_g1 - approx_g2) / 3
    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)

    # 2. Check hessp numerically along the second coordinate direction
    vector = np.zeros_like(g)
    vector[1] = 1
    hess_col = hessp(vector)
    # Simple finite differences are fragile for the Hessian. Instead, evaluate
    # the gradient along a path in the direction of `vector` and estimate the
    # slope with a least-squares regression.
    hess_step = 1e-3
    offsets = np.linspace(-hess_step, hess_step, 30)
    grads_on_path = np.array(
        [loss.gradient(coef + t * vector, X, y, **common) for t in offsets]
    )
    grads_on_path -= grads_on_path.mean(axis=0)
    approx_hess_col = linalg.lstsq(offsets[:, np.newaxis], grads_on_path)[0].ravel()
    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_multinomial_coef_shape(fit_intercept):
    """Check that multinomial LinearModelLoss preserves the shape of coef.

    Gradient and Hessian product must have the shape of the coefficients they
    were given, whether ``coef`` is passed as returned by ``random_X_y_coef``
    or raveled in F-order, and both layouts must give the same numbers.
    """
    loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    direction = np.random.RandomState(42).randn(*coef.shape)

    def gradient_and_hess_prod(coef_, direction_):
        # Evaluate all entry points and check shapes and mutual agreement.
        _, grad = loss.loss_gradient(coef_, X, y)
        grad_only = loss.gradient(coef_, X, y)
        grad_hp, hessp = loss.gradient_hessian_product(coef_, X, y)
        hess_prod = hessp(direction_)
        assert grad.shape == coef_.shape
        assert hess_prod.shape == coef_.shape
        assert_allclose(grad, grad_only)
        assert_allclose(grad, grad_hp)
        return grad, hess_prod

    g, h = gradient_and_hess_prod(coef, direction)
    g_r, h_r = gradient_and_hess_prod(
        coef.ravel(order="F"), direction.ravel(order="F")
    )

    # The raveled results, reshaped back, must equal the non-raveled ones.
    n_classes = loss.base_loss.n_classes
    assert_allclose(g, g_r.reshape(n_classes, -1, order="F"))
    assert_allclose(h, h_r.reshape(n_classes, -1, order="F"))

View File

@@ -0,0 +1,294 @@
# Author: Vlad Niculae
# License: BSD 3 clause
import numpy as np
import pytest
import warnings
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.linear_model import (
orthogonal_mp,
orthogonal_mp_gram,
OrthogonalMatchingPursuit,
OrthogonalMatchingPursuitCV,
LinearRegression,
)
from sklearn.utils import check_random_state
from sklearn.datasets import make_sparse_coded_signal
# Problem dimensions shared by the tests in this module.
n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3
# Generate a sparse-coded signal: `y` is the signal, `X` the dictionary and
# `gamma` the sparse code. The generator's parameter names are swapped relative
# to this module's: its `n_samples` receives our `n_targets`, its `n_components`
# our `n_features` and its `n_features` our `n_samples`.
y, X, gamma = make_sparse_coded_signal(
n_samples=n_targets,
n_components=n_features,
n_features=n_samples,
n_nonzero_coefs=n_nonzero_coefs,
random_state=0,
data_transposed=True,
)
# Make X not of norm 1 for testing
X *= 10
y *= 10
# Precomputed X^T X and X^T y, passed to the Gram-based solver in the tests below.
G, Xy = np.dot(X.T, X), np.dot(X.T, y)
# this makes X (n_samples, n_features)
# and y (n_samples, 3)
# FIXME: 'normalize' to set to False in 1.2 and removed in 1.4
@pytest.mark.parametrize(
    "OmpModel", [OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV]
)
@pytest.mark.parametrize(
    "normalize, n_warnings", [(True, 0), (False, 0), ("deprecated", 1)]
)
def test_assure_warning_when_normalize(OmpModel, normalize, n_warnings):
    """Check how many warnings `fit` emits depending on `normalize`.

    Per the parametrization, exactly one warning is expected when `normalize`
    is left at "deprecated" and none when it is set to True or False.
    """
    rng = check_random_state(0)
    data = rng.randn(200, 2)
    data[data < 0.1] = 0.0
    target = rng.rand(200)

    estimator = OmpModel(normalize=normalize)
    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter("always", FutureWarning)
        estimator.fit(data, target)

    assert len(recorded) == n_warnings
def test_correct_shapes():
    """Check the coefficient shape for single- and multi-target input."""
    coef_single = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
    assert coef_single.shape == (n_features,)

    coef_multi = orthogonal_mp(X, y, n_nonzero_coefs=5)
    assert coef_multi.shape == (n_features, 3)
def test_correct_shapes_gram():
    """Check the coefficient shape of the Gram solver for 1-D and 2-D `Xy`."""
    coef_single = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5)
    assert coef_single.shape == (n_features,)

    coef_multi = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5)
    assert coef_multi.shape == (n_features, 3)
def test_n_nonzero_coefs():
    """Check that at most `n_nonzero_coefs` coefficients are non-zero."""
    coef_plain = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
    assert np.count_nonzero(coef_plain) <= 5

    coef_precomputed = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5, precompute=True)
    assert np.count_nonzero(coef_precomputed) <= 5
def test_tol():
    """Check that the squared residual does not exceed the requested `tol`."""
    tol = 0.5
    target = y[:, 0]
    coef_plain = orthogonal_mp(X, target, tol=tol)
    coef_precomputed = orthogonal_mp(X, target, tol=tol, precompute=True)

    for coef in (coef_plain, coef_precomputed):
        residual = target - np.dot(X, coef)
        assert np.sum(residual**2) <= tol
def test_with_without_gram():
    """Check that `precompute=True` does not change the result."""
    coef_plain = orthogonal_mp(X, y, n_nonzero_coefs=5)
    coef_precomputed = orthogonal_mp(X, y, n_nonzero_coefs=5, precompute=True)
    assert_array_almost_equal(coef_plain, coef_precomputed)
def test_with_without_gram_tol():
    """Check that `precompute=True` does not change the result when using `tol`."""
    coef_plain = orthogonal_mp(X, y, tol=1.0)
    coef_precomputed = orthogonal_mp(X, y, tol=1.0, precompute=True)
    assert_array_almost_equal(coef_plain, coef_precomputed)
def test_unreachable_accuracy():
    """Check that `tol=0` gives the same result as selecting all atoms.

    With `precompute=True` both calls are additionally expected to emit the
    "ended prematurely" RuntimeWarning.
    """
    coef_zero_tol = orthogonal_mp(X, y, tol=0)
    coef_all_atoms = orthogonal_mp(X, y, n_nonzero_coefs=n_features)
    assert_array_almost_equal(coef_zero_tol, coef_all_atoms)

    expected_warning = (
        "Orthogonal matching pursuit ended prematurely "
        "due to linear dependence in the dictionary. "
        "The requested precision might not have been met."
    )
    with pytest.warns(RuntimeWarning, match=expected_warning):
        gram_zero_tol = orthogonal_mp(X, y, tol=0, precompute=True)
        gram_all_atoms = orthogonal_mp(
            X, y, precompute=True, n_nonzero_coefs=n_features
        )
        assert_array_almost_equal(gram_zero_tol, gram_all_atoms)
@pytest.mark.parametrize("positional_params", [(X, y), (G, Xy)])
@pytest.mark.parametrize(
    "keyword_params",
    [{"tol": -1}, {"n_nonzero_coefs": -1}, {"n_nonzero_coefs": n_features + 1}],
)
def test_bad_input(positional_params, keyword_params):
    """Check that invalid `tol` / `n_nonzero_coefs` values raise ValueError."""
    # NOTE(review): the (G, Xy) case is also routed through `orthogonal_mp`,
    # not `orthogonal_mp_gram` -- confirm whether that is intended.
    data, target = positional_params
    with pytest.raises(ValueError):
        orthogonal_mp(data, target, **keyword_params)
def test_perfect_signal_recovery():
    """Check that both solvers recover the support and values of the true code."""
    true_code = gamma[:, 0]
    true_support = np.flatnonzero(true_code)

    recovered = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
    recovered_gram = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5)

    # Same non-zero positions ...
    assert_array_equal(true_support, np.flatnonzero(recovered))
    assert_array_equal(true_support, np.flatnonzero(recovered_gram))
    # ... and the same values up to two decimals.
    assert_array_almost_equal(true_code, recovered, decimal=2)
    assert_array_almost_equal(true_code, recovered_gram, decimal=2)
def test_orthogonal_mp_gram_readonly():
    """Check that the Gram solver works on read-only inputs without copying.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/5956
    """
    true_code = gamma[:, 0]
    true_support = np.flatnonzero(true_code)

    # Work on read-only copies so the module-level arrays stay writeable.
    gram_ro = G.copy()
    gram_ro.setflags(write=False)
    xy_ro = Xy.copy()
    xy_ro.setflags(write=False)

    recovered = orthogonal_mp_gram(
        gram_ro, xy_ro[:, 0], n_nonzero_coefs=5, copy_Gram=False, copy_Xy=False
    )

    assert_array_equal(true_support, np.flatnonzero(recovered))
    assert_array_almost_equal(true_code, recovered, decimal=2)
# FIXME: 'normalize' to be removed in 1.4
@pytest.mark.filterwarnings("ignore:The default of 'normalize'")
def test_estimator():
    """Check shapes and sparsity of OrthogonalMatchingPursuit's fitted attributes.

    Covers single- and multi-target fits with the default parameters and with
    `fit_intercept=False, normalize=False`.
    """
    single_target, multi_target = y[:, 0], y
    single_bound = n_nonzero_coefs
    multi_bound = n_targets * n_nonzero_coefs

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)

    # Default parameters: the intercept has one entry per target.
    default_cases = [
        (single_target, (n_features,), (), single_bound),
        (multi_target, (n_targets, n_features), (n_targets,), multi_bound),
    ]
    for target, coef_shape, intercept_shape, max_atoms in default_cases:
        omp.fit(X, target)
        assert omp.coef_.shape == coef_shape
        assert omp.intercept_.shape == intercept_shape
        assert np.count_nonzero(omp.coef_) <= max_atoms

    # Switching off normalization must not change the first target's coefficients.
    coef_normalized = omp.coef_[0].copy()
    omp.set_params(fit_intercept=True, normalize=False)
    omp.fit(X, single_target)
    assert_array_almost_equal(coef_normalized, omp.coef_)

    # Without intercept, `intercept_` is zero for both kinds of target.
    omp.set_params(fit_intercept=False, normalize=False)
    no_intercept_cases = [
        (single_target, (n_features,), single_bound),
        (multi_target, (n_targets, n_features), multi_bound),
    ]
    for target, coef_shape, max_atoms in no_intercept_cases:
        omp.fit(X, target)
        assert omp.coef_.shape == coef_shape
        assert omp.intercept_ == 0
        assert np.count_nonzero(omp.coef_) <= max_atoms
def test_identical_regressors():
    """Check that duplicated dictionary columns trigger the premature-stop warning."""
    X_dup = X.copy()
    X_dup[:, 1] = X_dup[:, 0]  # make the first two atoms identical

    code = np.zeros(n_features)
    code[0] = 1.0
    code[1] = 1.0
    target = np.dot(X_dup, code)

    expected_warning = (
        "Orthogonal matching pursuit ended prematurely "
        "due to linear dependence in the dictionary. "
        "The requested precision might not have been met."
    )
    with pytest.warns(RuntimeWarning, match=expected_warning):
        orthogonal_mp(X_dup, target, n_nonzero_coefs=2)
def test_swapped_regressors():
    """Check the reported support when atom 21 is selected before atom 0."""
    # X[:, 21] should be selected first, then X[:, 0] selected second,
    # which will take X[:, 21]'s place in case the algorithm does
    # column swapping for optimization (which is the case at the moment)
    code = np.zeros(n_features)
    code[21] = 1.0
    code[0] = 0.5
    target = np.dot(X, code)
    target_Xy = np.dot(X.T, target)

    estimate = orthogonal_mp(X, target, n_nonzero_coefs=2)
    estimate_gram = orthogonal_mp_gram(G, target_Xy, n_nonzero_coefs=2)

    expected_support = [0, 21]
    assert_array_equal(np.flatnonzero(estimate), expected_support)
    assert_array_equal(np.flatnonzero(estimate_gram), expected_support)
def test_no_atoms():
    """Check that an all-zero target yields all-zero coefficients.

    Both the plain solver and the Gram-based solver are exercised; warnings
    emitted by the solvers are ignored.
    """
    y_empty = np.zeros_like(y)
    Xy_empty = np.dot(X.T, y_empty)
    gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, n_nonzero_coefs=1)
    # The Gram inputs (G, Xy) must go through `orthogonal_mp_gram`; calling
    # `orthogonal_mp` here would treat G as a design matrix and leave the Gram
    # solver untested.
    gamma_empty_gram = ignore_warnings(orthogonal_mp_gram)(
        G, Xy_empty, n_nonzero_coefs=1
    )
    assert np.all(gamma_empty == 0)
    assert np.all(gamma_empty_gram == 0)
def test_omp_path():
    """Check that the last step of the returned path equals the final solution."""
    solvers = ((orthogonal_mp, (X, y)), (orthogonal_mp_gram, (G, Xy)))
    for solver, inputs in solvers:
        full_path = solver(*inputs, n_nonzero_coefs=5, return_path=True)
        final = solver(*inputs, n_nonzero_coefs=5, return_path=False)
        assert full_path.shape == (n_features, n_targets, 5)
        assert_array_almost_equal(full_path[:, :, -1], final)
def test_omp_return_path_prop_with_gram():
    """Check `return_path` together with `precompute=True`."""
    options = {"n_nonzero_coefs": 5, "precompute": True}
    full_path = orthogonal_mp(X, y, return_path=True, **options)
    final = orthogonal_mp(X, y, return_path=False, **options)
    assert full_path.shape == (n_features, n_targets, 5)
    assert_array_almost_equal(full_path[:, :, -1], final)
# FIXME: 'normalize' to be removed in 1.4
@pytest.mark.filterwarnings("ignore:The default of 'normalize'")
def test_omp_cv():
    """Check that the CV estimator finds the true sparsity and coefficients."""
    target = y[:, 0]
    true_code = gamma[:, 0]

    cv_model = OrthogonalMatchingPursuitCV(
        normalize=True, fit_intercept=False, max_iter=10
    )
    cv_model.fit(X, target)
    assert cv_model.n_nonzero_coefs_ == n_nonzero_coefs
    assert_array_almost_equal(cv_model.coef_, true_code)

    # A plain estimator with the selected sparsity must give the same fit.
    plain_model = OrthogonalMatchingPursuit(
        normalize=True,
        fit_intercept=False,
        n_nonzero_coefs=cv_model.n_nonzero_coefs_,
    )
    plain_model.fit(X, target)
    assert_array_almost_equal(cv_model.coef_, plain_model.coef_)
# FIXME: 'normalize' to be removed in 1.4
@pytest.mark.filterwarnings("ignore:The default of 'normalize'")
def test_omp_reaches_least_squares():
    """Check that OMP with all atoms matches ordinary least squares."""
    # Use small simple data; it's a sanity check but OMP can stop early
    rng = check_random_state(0)
    n_rows, n_cols, n_outputs = 10, 8, 3
    data = rng.randn(n_rows, n_cols)
    targets = rng.randn(n_rows, n_outputs)

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_cols)
    ols = LinearRegression()
    omp.fit(data, targets)
    ols.fit(data, targets)

    assert_array_almost_equal(omp.coef_, ols.coef_)
@pytest.mark.parametrize("data_type", (np.float32, np.float64))
def test_omp_gram_dtype_match(data_type):
    """Check that the output dtype matches the dtype of the Gram inputs."""
    gram_cast = G.astype(data_type)
    xy_cast = Xy.astype(data_type)
    coef = orthogonal_mp_gram(gram_cast, xy_cast, n_nonzero_coefs=5)
    assert coef.dtype == data_type
def test_omp_gram_numerical_consistency():
    """Check numerical consistency between np.float32 and np.float64 inputs."""
    coef_32 = orthogonal_mp_gram(
        G.astype(np.float32), Xy.astype(np.float32), n_nonzero_coefs=5
    )
    # Both inputs must be float64 here: passing a float32 Gram matrix for the
    # "64-bit" run would not compare against a double-precision computation.
    coef_64 = orthogonal_mp_gram(
        G.astype(np.float64), Xy.astype(np.float64), n_nonzero_coefs=5
    )
    # Single precision only carries ~7 significant digits, so compare with a
    # tolerance suited to float32.
    # NOTE(review): tolerances chosen for float32 round-off; confirm on CI.
    assert_allclose(coef_32, coef_64, rtol=1e-4, atol=1e-5)

View File

@@ -0,0 +1,318 @@
import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.base import ClassifierMixin
from sklearn.utils import check_random_state
from sklearn.datasets import load_iris
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveRegressor
# Load iris and shuffle its rows with a fixed seed; `X` / `y` are shared by the
# tests below.
iris = load_iris()
random_state = check_random_state(12)
indices = np.arange(iris.data.shape[0])
random_state.shuffle(indices)
X = iris.data[indices]
y = iris.target[indices]
# CSR version of the same data so tests can also run on sparse input.
X_csr = sp.csr_matrix(X)
class MyPassiveAggressive(ClassifierMixin):
    """Pure-Python reference implementation of the passive-aggressive updates.

    Used by the correctness tests as a baseline for both classification losses
    ("hinge", "squared_hinge") and regression losses ("epsilon_insensitive",
    "squared_epsilon_insensitive").
    """

    def __init__(
        self,
        C=1.0,
        epsilon=0.01,
        loss="hinge",
        fit_intercept=True,
        n_iter=1,
        random_state=None,
    ):
        # `random_state` is accepted but not stored or used.
        self.C = C
        self.epsilon = epsilon
        self.loss = loss
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter

    def fit(self, X, y):
        """Run `n_iter` passes over the rows of X, updating `w` and `b` in place."""
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=np.float64)
        self.b = 0.0

        uses_hinge = self.loss in ("hinge", "squared_hinge")
        for _ in range(self.n_iter):
            for i in range(n_samples):
                x_i, y_i = X[i], y[i]
                p = self.project(x_i)

                # Loss suffered on the current sample.
                if uses_hinge:
                    loss = max(1 - y_i * p, 0)
                else:
                    loss = max(np.abs(p - y_i) - self.epsilon, 0)

                # Step size: clipped for the plain losses, damped for the
                # squared ones.
                sqnorm = np.dot(x_i, x_i)
                if self.loss in ("hinge", "epsilon_insensitive"):
                    step = min(self.C, loss / sqnorm)
                elif self.loss in ("squared_hinge", "squared_epsilon_insensitive"):
                    step = loss / (sqnorm + 1.0 / (2 * self.C))

                # Direction of the update.
                if uses_hinge:
                    step *= y_i
                else:
                    step *= np.sign(y_i - p)

                self.w += step * x_i
                if self.fit_intercept:
                    self.b += step

    def project(self, X):
        """Return the linear score `X . w + b`."""
        return np.dot(X, self.w) + self.b
def test_classifier_accuracy():
    """Check training accuracy on iris for dense/sparse data and all options."""
    averaging_attrs = (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    )
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(
                    C=1.0,
                    max_iter=30,
                    fit_intercept=fit_intercept,
                    random_state=1,
                    average=average,
                    tol=None,
                )
                clf.fit(data, y)
                assert clf.score(data, y) > 0.79
                if average:
                    # Averaging keeps both the averaged and the standard weights.
                    for attr in averaging_attrs:
                        assert hasattr(clf, attr)
def test_classifier_partial_fit():
    """Check training accuracy after 30 rounds of `partial_fit`."""
    labels = np.unique(y)
    averaging_attrs = (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    )
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(
                random_state=0, average=average, max_iter=5
            )
            for _ in range(30):
                clf.partial_fit(data, y, labels)
            assert clf.score(data, y) > 0.79
            if average:
                for attr in averaging_attrs:
                    assert hasattr(clf, attr)
def test_classifier_refit():
    """Check that a classifier can be retrained on different labels and features."""
    clf = PassiveAggressiveClassifier(max_iter=5)
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))

    # Refit with one feature fewer and string labels instead of integers.
    string_labels = iris.target_names[y]
    clf.fit(X[:, :-1], string_labels)
    assert_array_equal(clf.classes_, iris.target_names)
@pytest.mark.parametrize("loss", ("hinge", "squared_hinge"))
def test_classifier_correctness(loss):
    """Compare the classifier's weights with the reference implementation."""
    # Binary problem: class 1 versus the rest, encoded as +1 / -1.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, y_bin)

    for data in (X, X_csr):
        estimator = PassiveAggressiveClassifier(
            loss=loss, max_iter=2, shuffle=False, tol=None
        )
        estimator.fit(data, y_bin)
        assert_array_almost_equal(reference.w, estimator.coef_.ravel(), decimal=2)
@pytest.mark.parametrize(
    "response_method", ["predict_proba", "predict_log_proba", "transform"]
)
def test_classifier_undefined_methods(response_method):
    """Check that the classifier does not expose the given method."""
    classifier = PassiveAggressiveClassifier(max_iter=100)
    with pytest.raises(AttributeError):
        getattr(classifier, response_method)
def test_class_weights():
    """Check that class weights change the prediction for a probe point."""
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]
    probe = [[0.2, -1.0]]

    def predict_probe(class_weight):
        # Fit with the given class weights and classify the probe point.
        clf = PassiveAggressiveClassifier(
            C=0.1, max_iter=100, class_weight=class_weight, random_state=100
        )
        clf.fit(X2, y2)
        return clf.predict(probe)

    # Without class weights the probe is assigned to class 1.
    assert_array_equal(predict_probe(None), np.array([1]))

    # With a small weight on class 1 the hyperplane should rotate clock-wise
    # and the prediction on this point should shift.
    assert_array_equal(predict_probe({1: 0.001}), np.array([-1]))
def test_partial_fit_weight_class_balanced():
    """Check that `partial_fit` rejects `class_weight='balanced'`."""
    classifier = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
    labels = np.unique(y)
    with pytest.raises(ValueError):
        classifier.partial_fit(X, y, classes=labels)
def test_equal_class_weight():
    """Check that equal class weights give (almost) the same coefficients."""
    X2 = [[1, 0], [1, 0], [0, 1], [0, 1]]
    y2 = [0, 0, 1, 1]

    # Fit order is kept as None, "balanced", explicit dict. The data is already
    # balanced, so "balanced" weights should have no effect.
    fitted = []
    for class_weight in (None, "balanced", {0: 0.5, 1: 0.5}):
        model = PassiveAggressiveClassifier(
            C=0.1, tol=None, class_weight=class_weight
        )
        model.fit(X2, y2)
        fitted.append(model)
    clf, clf_balanced, clf_weighted = fitted

    # should be similar up to some epsilon due to learning rate schedule
    assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
    assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2)
def test_wrong_class_weight_label():
    """Check that a class_weight key not present in `y` raises ValueError."""
    features = np.array(
        [[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]
    )
    labels = [1, 1, 1, -1, -1]

    # Label 0 does not occur in `labels`.
    classifier = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100)
    with pytest.raises(ValueError):
        classifier.fit(features, labels)
def test_wrong_class_weight_format():
    """Check that a class_weight of the wrong type raises ValueError."""
    features = np.array(
        [[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]
    )
    labels = [1, 1, 1, -1, -1]

    for bad_class_weight in ([0.5], "the larch"):
        classifier = PassiveAggressiveClassifier(
            class_weight=bad_class_weight, max_iter=100
        )
        with pytest.raises(ValueError):
            classifier.fit(features, labels)
def test_regressor_mse():
    """Check the training MSE of the regressor on a +1 / -1 target."""
    y_bin = y.copy()
    y_bin[y != 1] = -1
    averaging_attrs = (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    )
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                reg = PassiveAggressiveRegressor(
                    C=1.0,
                    fit_intercept=fit_intercept,
                    random_state=0,
                    average=average,
                    max_iter=5,
                )
                reg.fit(data, y_bin)
                errors = reg.predict(data) - y_bin
                assert np.mean(errors**2) < 1.7
                if average:
                    for attr in averaging_attrs:
                        assert hasattr(reg, attr)
def test_regressor_partial_fit():
    """Check the training MSE after 50 rounds of `partial_fit`."""
    y_bin = y.copy()
    y_bin[y != 1] = -1
    averaging_attrs = (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    )
    for data in (X, X_csr):
        for average in (False, True):
            reg = PassiveAggressiveRegressor(
                random_state=0, average=average, max_iter=100
            )
            for _ in range(50):
                reg.partial_fit(data, y_bin)
            errors = reg.predict(data) - y_bin
            assert np.mean(errors**2) < 1.7
            if average:
                for attr in averaging_attrs:
                    assert hasattr(reg, attr)
@pytest.mark.parametrize("loss", ("epsilon_insensitive", "squared_epsilon_insensitive"))
def test_regressor_correctness(loss):
    """Compare the regressor's weights with the reference implementation."""
    # Regression target: +1 for class 1, -1 for everything else.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, y_bin)

    for data in (X, X_csr):
        estimator = PassiveAggressiveRegressor(
            tol=None, loss=loss, max_iter=2, shuffle=False
        )
        estimator.fit(data, y_bin)
        assert_array_almost_equal(reference.w, estimator.coef_.ravel(), decimal=2)
def test_regressor_undefined_methods():
    """Check that the regressor has no `transform` method."""
    regressor = PassiveAggressiveRegressor(max_iter=100)
    with pytest.raises(AttributeError):
        regressor.transform(X)
@pytest.mark.parametrize(
    "klass", [PassiveAggressiveClassifier, PassiveAggressiveRegressor]
)
@pytest.mark.parametrize("fit_method", ["fit", "partial_fit"])
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"loss": "foobar"}, "The loss foobar is not supported"),
        ({"max_iter": -1}, "max_iter must be > zero"),
        ({"shuffle": "false"}, "shuffle must be either True or False"),
        ({"early_stopping": "false"}, "early_stopping must be either True or False"),
        (
            {"validation_fraction": -0.1},
            r"validation_fraction must be in range \(0, 1\)",
        ),
        ({"n_iter_no_change": 0}, "n_iter_no_change must be >= 1"),
    ],
)
def test_passive_aggressive_estimator_params_validation(
    klass, fit_method, params, err_msg
):
    """Check that invalid constructor parameters are rejected when fitting."""
    estimator = klass(**params)

    # Classifiers need the list of classes when trained with `partial_fit`.
    needs_classes = is_classifier(estimator) and fit_method == "partial_fit"
    fit_params = {"classes": np.unique(y)} if needs_classes else {}

    fit = getattr(estimator, fit_method)
    with pytest.raises(ValueError, match=err_msg):
        fit(X, y, **fit_params)

View File

@@ -0,0 +1,90 @@
import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils import check_random_state
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron
# Load iris and shuffle its rows with a fixed seed; `X` / `y` are shared by the
# tests below.
iris = load_iris()
random_state = check_random_state(12)
indices = np.arange(iris.data.shape[0])
random_state.shuffle(indices)
X = iris.data[indices]
y = iris.target[indices]
# CSR version of the same data (with sorted indices) for the sparse code path.
X_csr = sp.csr_matrix(X)
X_csr.sort_indices()
class MyPerceptron:
    """Minimal pure-Python perceptron used as a reference in the tests."""

    def __init__(self, n_iter=1):
        self.n_iter = n_iter

    def fit(self, X, y):
        """Run `n_iter` passes over the rows of X, updating on every mistake."""
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=np.float64)
        self.b = 0.0

        for _ in range(self.n_iter):
            for i in range(n_samples):
                sample, label = X[i], y[i]
                misclassified = self.predict(sample)[0] != label
                if misclassified:
                    # Move the hyperplane towards the misclassified sample.
                    self.w += label * sample
                    self.b += label

    def project(self, X):
        """Return the linear score `X . w + b`."""
        return np.dot(X, self.w) + self.b

    def predict(self, X):
        """Return the sign of the score for each row of X (1-D input allowed)."""
        return np.sign(self.project(np.atleast_2d(X)))
def test_perceptron_accuracy():
    """Check the training accuracy on iris for dense and sparse input."""
    for data in (X, X_csr):
        classifier = Perceptron(max_iter=100, tol=None, shuffle=False)
        classifier.fit(data, y)
        assert classifier.score(data, y) > 0.7
def test_perceptron_correctness():
    """Compare Perceptron's weights with the reference implementation."""
    # Binary problem: class 1 versus the rest, encoded as +1 / -1.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPerceptron(n_iter=2)
    reference.fit(X, y_bin)

    estimator = Perceptron(max_iter=2, shuffle=False, tol=None)
    estimator.fit(X, y_bin)

    assert_array_almost_equal(reference.w, estimator.coef_.ravel())
def test_undefined_methods():
    """Check that Perceptron does not expose probability methods."""
    classifier = Perceptron(max_iter=100)
    for method_name in ("predict_proba", "predict_log_proba"):
        with pytest.raises(AttributeError):
            getattr(classifier, method_name)
def test_perceptron_l1_ratio():
    """Check that `l1_ratio` has an impact when `penalty='elasticnet'`"""

    def fitted(**params):
        # Build a Perceptron with the given parameters and fit it on iris.
        return Perceptron(**params).fit(X, y)

    # Different l1_ratio values must lead to different training scores.
    ratio_zero = fitted(l1_ratio=0, penalty="elasticnet")
    ratio_small = fitted(l1_ratio=0.15, penalty="elasticnet")
    assert ratio_zero.score(X, y) != ratio_small.score(X, y)

    # At its bounds, elastic net must coincide with a pure penalty:
    # l1_ratio=1 with "l1" and l1_ratio=0 with "l2".
    pure_l1 = fitted(penalty="l1")
    enet_as_l1 = fitted(l1_ratio=1, penalty="elasticnet")
    assert_allclose(pure_l1.coef_, enet_as_l1.coef_)

    pure_l2 = fitted(penalty="l2")
    enet_as_l2 = fitted(l1_ratio=0, penalty="elasticnet")
    assert_allclose(pure_l2.coef_, enet_as_l2.coef_)

View File

@@ -0,0 +1,291 @@
# Authors: David Dale <dale.david@mail.ru>
# Christian Lorentzen <lorentzen.ch@gmail.com>
# License: BSD 3 clause
import numpy as np
import pytest
from pytest import approx
from scipy.optimize import minimize
from scipy import sparse
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import HuberRegressor, QuantileRegressor
from sklearn.metrics import mean_pinball_loss
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils.fixes import parse_version, sp_version
@pytest.fixture
def X_y_data():
    """Return a small noisy one-feature regression problem as `(X, y)`."""
    features, target = make_regression(
        n_samples=10, n_features=1, random_state=0, noise=1
    )
    return features, target
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"quantile": 2}, "Quantile should be strictly between 0.0 and 1.0"),
        ({"quantile": 1}, "Quantile should be strictly between 0.0 and 1.0"),
        ({"quantile": 0}, "Quantile should be strictly between 0.0 and 1.0"),
        ({"quantile": -1}, "Quantile should be strictly between 0.0 and 1.0"),
        ({"alpha": -1.5}, "Penalty alpha must be a non-negative number"),
        ({"fit_intercept": "blah"}, "The argument fit_intercept must be bool"),
        ({"fit_intercept": 0}, "The argument fit_intercept must be bool"),
        ({"solver": "blah"}, "Invalid value for argument solver"),
        (
            {"solver_options": "blah"},
            "Invalid value for argument solver_options",
        ),
    ],
)
def test_init_parameters_validation(X_y_data, params, err_msg):
    """Check that invalid constructor parameters raise ValueError."""
    features, target = X_y_data
    with pytest.raises(ValueError, match=err_msg):
        regressor = QuantileRegressor(**params)
        regressor.fit(features, target)
@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
def test_incompatible_solver_for_sparse_input(X_y_data, solver):
    """Check that solvers without sparse support reject sparse `X`."""
    features, target = X_y_data
    sparse_features = sparse.csc_matrix(features)
    expected_message = (
        f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
    )
    regressor = QuantileRegressor(solver=solver)
    with pytest.raises(ValueError, match=expected_message):
        regressor.fit(sparse_features, target)
@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(
    sp_version >= parse_version("1.6.0"),
    reason="Solvers are available as of scipy 1.6.0",
)
def test_too_new_solver_methods_raise_error(X_y_data, solver):
    """Check that the highs solvers raise an error for scipy<1.6.0."""
    features, target = X_y_data
    regressor = QuantileRegressor(solver=solver)
    with pytest.raises(ValueError, match="scipy>=1.6.0"):
        regressor.fit(features, target)
@pytest.mark.parametrize(
    "quantile, alpha, intercept, coef",
    [
        # for 50% quantile w/o regularization, any slope in [1, 10] is okay
        [0.5, 0, 1, None],
        # if positive error costs more, the slope is maximal
        [0.51, 0, 1, 10],
        # if negative error costs more, the slope is minimal
        [0.49, 0, 1, 1],
        # for a small lasso penalty, the slope is also minimal
        [0.5, 0.01, 1, 1],
        # for a large lasso penalty, the model predicts the constant median
        [0.5, 100, 2, 0],
    ],
)
def test_quantile_toy_example(quantile, alpha, intercept, coef):
    """Check how `quantile` and `alpha` affect the fit on a tiny example."""
    features = [[0], [1], [1]]
    target = [1, 2, 11]
    model = QuantileRegressor(quantile=quantile, alpha=alpha)
    model.fit(features, target)
    slope = model.coef_[0]

    assert_allclose(model.intercept_, intercept, atol=1e-2)
    if coef is not None:
        assert_allclose(slope, coef, atol=1e-2)
    if alpha < 100:
        # Unless the penalty is large, the slope lies between the two extremes.
        assert 1 <= slope <= 10
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_quantile_equals_huber_for_low_epsilon(fit_intercept):
    """Check that median regression is close to Huber with epsilon near 1."""
    features, target = make_regression(
        n_samples=100, n_features=20, random_state=0, noise=1.0
    )
    shared = {"alpha": 1e-4, "fit_intercept": fit_intercept}

    huber = HuberRegressor(epsilon=1 + 1e-4, **shared).fit(features, target)
    quant = QuantileRegressor(**shared).fit(features, target)

    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
    if fit_intercept:
        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
        # check that we still predict fraction
        fraction_below = np.mean(target < quant.predict(features))
        assert fraction_below == approx(0.5, abs=1e-1)
@pytest.mark.parametrize("q", [0.5, 0.9, 0.05])
def test_quantile_estimates_calibration(q):
    """Check that a fraction `q` of the points lies below the prediction."""
    features, target = make_regression(
        n_samples=1000, n_features=20, random_state=0, noise=1.0
    )
    model = QuantileRegressor(quantile=q, alpha=0, solver_options={"lstsq": False})
    model.fit(features, target)

    fraction_below = np.mean(target < model.predict(features))
    assert fraction_below == approx(q, abs=1e-2)
def test_quantile_sample_weight():
    """Check that unequal sample weights yield the weighted quantile."""
    n_obs = 1000
    features, target = make_regression(
        n_samples=n_obs, n_features=5, random_state=0, noise=10.0
    )
    # when we increase weight of upper observations,
    # estimate of quantile should go up
    weights = np.ones(n_obs)
    weights[target > target.mean()] = 100

    model = QuantileRegressor(quantile=0.5, alpha=1e-8, solver_options={"lstsq": False})
    model.fit(features, target, sample_weight=weights)
    is_below = target < model.predict(features)

    # Unweighted, more than half of the points end up below the prediction ...
    assert np.mean(is_below) > 0.5
    # ... while the weighted fraction stays close to the requested quantile.
    assert np.average(is_below, weights=weights) == approx(0.5, abs=3e-2)
@pytest.mark.skipif(
    sp_version < parse_version("1.6.0"),
    reason="The `highs` solver is available from the 1.6.0 scipy version",
)
@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
def test_asymmetric_error(quantile):
    """Test quantile regression for asymmetric distributed targets."""
    n_samples = 1000
    rng = np.random.RandomState(42)
    # Draw order matters for reproducibility: normal column first, then binary.
    abs_normal_col = np.abs(rng.randn(n_samples)[:, None])
    neg_binary_col = -rng.randint(2, size=(n_samples, 1))
    X = np.concatenate((abs_normal_col, neg_binary_col), axis=1)

    true_intercept = 1.23
    true_coef = np.array([0.5, -2])
    linear_predictor = X @ true_coef + true_intercept
    # Take care that X @ coef + intercept > 0
    assert np.min(linear_predictor) > 0

    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
    # the quantile at level q is:
    #   quantile(q) = - log(1 - q) / lambda
    #   scale = 1/lambda = -quantile(q) / log(1 - q)
    y = rng.exponential(
        scale=-linear_predictor / np.log(1 - quantile), size=n_samples
    )

    # This test can be made to pass with any solver but in the interest
    # of sparing continuous integration resources, the test is performed
    # with the fastest solver only.
    model = QuantileRegressor(quantile=quantile, alpha=0, solver="highs")
    model.fit(X, y)
    assert model.intercept_ == approx(true_intercept, rel=0.2)
    assert_allclose(model.coef_, true_coef, rtol=0.6)
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)

    # Now compare to Nelder-Mead optimization with L1 penalty
    alpha = 0.01
    model.set_params(alpha=alpha).fit(X, y)
    fitted_params = np.r_[model.intercept_, model.coef_]

    def objective(params):
        # Pinball loss plus an L1 penalty on the coefficients; the intercept
        # (first entry of `params`) is not penalized.
        prediction = X @ params[1:] + params[0]
        pinball = mean_pinball_loss(y, prediction, alpha=quantile)
        l1_norm = np.sum(np.abs(params[1:]))
        return pinball + alpha * l1_norm

    res = minimize(
        fun=objective,
        x0=[1, 0, -1],
        method="Nelder-Mead",
        tol=1e-12,
        options={"maxiter": 2000},
    )

    assert objective(fitted_params) == approx(objective(res.x))
    assert_allclose(model.intercept_, res.x[0])
    assert_allclose(model.coef_, res.x[1:])
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)
@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
def test_equivariance(quantile):
    """Test equivariance of quantile regression.

    See Koenker (2005) Quantile Regression, Chapter 2.2.3.
    """
    rng = np.random.RandomState(42)
    n_samples, n_features = 100, 5
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        noise=0,
        random_state=rng,
        shuffle=False,
    )
    # make y asymmetric
    y += rng.exponential(scale=100, size=y.shape)

    params = {"alpha": 0, "solver_options": {"lstsq": True, "tol": 1e-10}}
    reference = QuantileRegressor(quantile=quantile, **params).fit(X, y)

    # coef(q; a*y, X) = a * coef(q; y, X)
    a = 2.5
    scaled = QuantileRegressor(quantile=quantile, **params).fit(X, a * y)
    assert scaled.intercept_ == approx(a * reference.intercept_, rel=1e-5)
    assert_allclose(scaled.coef_, a * reference.coef_, rtol=1e-5)

    # coef(1-q; -a*y, X) = -a * coef(q; y, X)
    mirrored = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y)
    assert mirrored.intercept_ == approx(-a * reference.intercept_, rel=1e-5)
    assert_allclose(mirrored.coef_, -a * reference.coef_, rtol=1e-5)

    # coef(q; y + X @ g, X) = coef(q; y, X) + g
    g_intercept = rng.randn()
    g_coef = rng.randn(n_features)
    shifted = QuantileRegressor(quantile=quantile, **params)
    shifted.fit(X, y + X @ g_coef + g_intercept)
    assert shifted.intercept_ == approx(reference.intercept_ + g_intercept)
    assert_allclose(shifted.coef_, reference.coef_ + g_coef, rtol=1e-6)

    # coef(q; y, X @ A) = A^-1 @ coef(q; y, X)
    A = rng.randn(n_features, n_features)
    reparametrized = QuantileRegressor(quantile=quantile, **params)
    reparametrized.fit(X @ A, y)
    assert reparametrized.intercept_ == approx(reference.intercept_, rel=1e-5)
    assert_allclose(
        reparametrized.coef_, np.linalg.solve(A, reference.coef_), rtol=1e-5
    )
def test_linprog_failure():
    """Check that a ConvergenceWarning is emitted when linprog is cut short."""
    grid = np.linspace(0, 10, num=10)
    X = grid.reshape(-1, 1)
    y = np.linspace(0, 10, num=10)
    # A single iteration is allowed for the solver.
    reg = QuantileRegressor(
        alpha=0, solver="interior-point", solver_options={"maxiter": 1}
    )

    expected = "Linear programming for QuantileRegressor did not succeed."
    with pytest.warns(ConvergenceWarning, match=expected):
        reg.fit(X, y)
# NOTE(review): `<=` also skips scipy 1.6.0 itself although the reason string
# says the solvers are available "as of" 1.6.0 -- confirm which is intended.
@skip_if_32bit
@pytest.mark.skipif(
    sp_version <= parse_version("1.6.0"),
    reason="Solvers are available as of scipy 1.6.0",
)
@pytest.mark.parametrize(
    "sparse_format", [sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix]
)
@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_sparse_input(sparse_format, solver, fit_intercept):
    """Test that sparse and dense X give same results."""
    X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
    X_sparse = sparse_format(X)
    alpha = 1e-4

    # The dense reference model is built without an explicit solver.
    dense_model = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept)
    dense_model.fit(X, y)
    sparse_model = QuantileRegressor(
        alpha=alpha, fit_intercept=fit_intercept, solver=solver
    )
    sparse_model.fit(X_sparse, y)

    assert_allclose(sparse_model.coef_, dense_model.coef_, rtol=1e-2)
    if fit_intercept:
        assert sparse_model.intercept_ == approx(dense_model.intercept_)
        # check that we still predict fraction
        fraction_below = np.mean(y < sparse_model.predict(X_sparse))
        assert 0.45 <= fraction_below <= 0.55

View File

@@ -0,0 +1,650 @@
import numpy as np
import pytest
from scipy import sparse
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, RANSACRegressor, Ridge
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model._ransac import _dynamic_max_trials
from sklearn.exceptions import ConvergenceWarning
# Module-level data (X, y, outliers) referenced by the tests below.
# Generate coordinates of line: integer abscissae in [-200, 200) and the
# matching ordinates on y = 0.2 * x + 20.
X = np.arange(-200, 200)
y = 0.2 * X + 20
data = np.column_stack([X, y])
# Add some faulty data: 200 row indices are drawn and deduplicated with
# np.unique (so `outliers` may hold fewer than 200 indices); both
# coordinates of those rows are shifted by 50 + 10 * rng.rand(...).
rng = np.random.RandomState(1000)
outliers = np.unique(rng.randint(len(X), size=200))
data[outliers, :] += 50 + rng.rand(len(outliers), 2) * 10
# Split back into a single-column 2-D X and a 1-D y.
X = data[:, 0][:, np.newaxis]
y = data[:, 1]
def test_ransac_inliers_outliers():
    """Check that the inlier mask excludes exactly the corrupted samples."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, y)

    # Ground truth / reference inlier mask: everything except `outliers`.
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_is_data_valid():
    """Check that fit raises when ``is_data_valid`` rejects every subset."""

    def is_data_valid(X, y):
        # The callback is expected to receive subsets of `min_samples` rows.
        assert X.shape[0] == 2
        assert y.shape[0] == 2
        return False

    local_rng = np.random.RandomState(0)
    X = local_rng.rand(10, 2)
    y = local_rng.rand(10, 1)

    ransac_estimator = RANSACRegressor(
        LinearRegression(),
        min_samples=2,
        residual_threshold=5,
        is_data_valid=is_data_valid,
        random_state=0,
    )
    with pytest.raises(ValueError):
        ransac_estimator.fit(X, y)
def test_ransac_is_model_valid():
    """Check that fit raises when ``is_model_valid`` rejects every model."""

    def is_model_valid(estimator, X, y):
        # The callback is expected to receive subsets of `min_samples` rows.
        assert X.shape[0] == 2
        assert y.shape[0] == 2
        return False

    ransac_estimator = RANSACRegressor(
        LinearRegression(),
        min_samples=2,
        residual_threshold=5,
        is_model_valid=is_model_valid,
        random_state=0,
    )
    with pytest.raises(ValueError):
        ransac_estimator.fit(X, y)
def test_ransac_max_trials():
    """Check ``max_trials=0`` is rejected and ``n_trials_`` stays bounded."""
    estimator = LinearRegression()

    zero_trials = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        max_trials=0,
        random_state=0,
    )
    with pytest.raises(ValueError):
        zero_trials.fit(X, y)

    # there is a 1e-9 chance it will take these many trials. No good reason
    # 1e-2 isn't enough, can still happen
    # 2 is the what ransac defines as min_samples = X.shape[1] + 1
    n_inliers = len(X) - len(outliers)
    max_trials = _dynamic_max_trials(n_inliers, X.shape[0], 2, 1 - 1e-9)

    ransac_estimator = RANSACRegressor(estimator, min_samples=2)
    for seed in range(50):
        ransac_estimator.set_params(min_samples=2, random_state=seed)
        ransac_estimator.fit(X, y)
        assert ransac_estimator.n_trials_ < max_trials + 1
def test_ransac_stop_n_inliers():
    """Check that ``stop_n_inliers=2`` ends the search after one trial."""
    options = {
        "min_samples": 2,
        "residual_threshold": 5,
        "stop_n_inliers": 2,
        "random_state": 0,
    }
    ransac_estimator = RANSACRegressor(LinearRegression(), **options)
    ransac_estimator.fit(X, y)

    assert ransac_estimator.n_trials_ == 1
def test_ransac_stop_score():
    """Check that ``stop_score=0`` ends the search after one trial."""
    options = {
        "min_samples": 2,
        "residual_threshold": 5,
        "stop_score": 0,
        "random_state": 0,
    }
    ransac_estimator = RANSACRegressor(LinearRegression(), **options)
    ransac_estimator.fit(X, y)

    assert ransac_estimator.n_trials_ == 1
def test_ransac_score():
    """Check ``score`` on the all-zero samples and on the two perturbed ones."""
    X = np.arange(100)[:, None]
    # All targets are zero except the first two samples.
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100

    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=0.5, random_state=0
    )
    ransac_estimator.fit(X, y)

    score_unperturbed = ransac_estimator.score(X[2:], y[2:])
    assert score_unperturbed == 1
    score_perturbed = ransac_estimator.score(X[:2], y[:2])
    assert score_perturbed < 1
def test_ransac_predict():
    """Check that predictions are all zero despite two perturbed targets."""
    X = np.arange(100)[:, None]
    # All targets are zero except the first two samples.
    y = np.zeros((100,))
    y[0] = 1
    y[1] = 100

    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=0.5, random_state=0
    )
    ransac_estimator.fit(X, y)

    predictions = ransac_estimator.predict(X)
    assert_array_equal(predictions, np.zeros(100))
def test_ransac_residuals_threshold_no_inliers():
    """Check the error and skip counters when no sample can be an inlier."""
    # When residual_threshold=nan there are no inliers and a
    # ValueError with a message should be raised
    ransac_estimator = RANSACRegressor(
        LinearRegression(),
        min_samples=2,
        residual_threshold=float("nan"),
        random_state=0,
        max_trials=5,
    )

    msg = "RANSAC could not find a valid consensus set"
    with pytest.raises(ValueError, match=msg):
        ransac_estimator.fit(X, y)

    skip_counts = (
        ransac_estimator.n_skips_no_inliers_,
        ransac_estimator.n_skips_invalid_data_,
        ransac_estimator.n_skips_invalid_model_,
    )
    assert skip_counts[0] == 5
    assert skip_counts[1] == 0
    assert skip_counts[2] == 0
def test_ransac_no_valid_data():
    """Check the error and skip counters when all data subsets are rejected."""

    def is_data_valid(X, y):
        return False

    ransac_estimator = RANSACRegressor(
        LinearRegression(), is_data_valid=is_data_valid, max_trials=5
    )

    msg = "RANSAC could not find a valid consensus set"
    with pytest.raises(ValueError, match=msg):
        ransac_estimator.fit(X, y)

    skip_counts = (
        ransac_estimator.n_skips_no_inliers_,
        ransac_estimator.n_skips_invalid_data_,
        ransac_estimator.n_skips_invalid_model_,
    )
    assert skip_counts[0] == 0
    assert skip_counts[1] == 5
    assert skip_counts[2] == 0
def test_ransac_no_valid_model():
    """Check the error and skip counters when all fitted models are rejected."""

    def is_model_valid(estimator, X, y):
        return False

    ransac_estimator = RANSACRegressor(
        LinearRegression(), is_model_valid=is_model_valid, max_trials=5
    )

    msg = "RANSAC could not find a valid consensus set"
    with pytest.raises(ValueError, match=msg):
        ransac_estimator.fit(X, y)

    skip_counts = (
        ransac_estimator.n_skips_no_inliers_,
        ransac_estimator.n_skips_invalid_data_,
        ransac_estimator.n_skips_invalid_model_,
    )
    assert skip_counts[0] == 0
    assert skip_counts[1] == 0
    assert skip_counts[2] == 5
def test_ransac_exceed_max_skips():
    """Check the error raised once more than ``max_skips`` trials are skipped."""

    def is_data_valid(X, y):
        return False

    ransac_estimator = RANSACRegressor(
        LinearRegression(), is_data_valid=is_data_valid, max_trials=5, max_skips=3
    )

    msg = "RANSAC skipped more iterations than `max_skips`"
    with pytest.raises(ValueError, match=msg):
        ransac_estimator.fit(X, y)

    skip_counts = (
        ransac_estimator.n_skips_no_inliers_,
        ransac_estimator.n_skips_invalid_data_,
        ransac_estimator.n_skips_invalid_model_,
    )
    assert skip_counts[0] == 0
    assert skip_counts[1] == 4
    assert skip_counts[2] == 0
def test_ransac_warn_exceed_max_skips():
    """Check the warning when skips exceed ``max_skips`` after a valid fit."""
    global cause_skip
    cause_skip = False

    def is_data_valid(X, y):
        # Accept the first subset only; reject every later one.
        global cause_skip
        if cause_skip:
            return False
        cause_skip = True
        return True

    ransac_estimator = RANSACRegressor(
        LinearRegression(), is_data_valid=is_data_valid, max_skips=3, max_trials=5
    )

    warning_message = (
        "RANSAC found a valid consensus set but exited "
        "early due to skipping more iterations than "
        "`max_skips`. See estimator attributes for "
        "diagnostics."
    )
    with pytest.warns(ConvergenceWarning, match=warning_message):
        ransac_estimator.fit(X, y)

    skip_counts = (
        ransac_estimator.n_skips_no_inliers_,
        ransac_estimator.n_skips_invalid_data_,
        ransac_estimator.n_skips_invalid_model_,
    )
    assert skip_counts[0] == 0
    assert skip_counts[1] == 4
    assert skip_counts[2] == 0
def test_ransac_sparse_coo():
    """Check the inlier mask when X is given as a COO sparse matrix."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )
    ransac_estimator.fit(sparse.coo_matrix(X), y)

    # Reference mask: everything except the corrupted rows.
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_sparse_csr():
    """Check the inlier mask when X is given as a CSR sparse matrix."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )
    ransac_estimator.fit(sparse.csr_matrix(X), y)

    # Reference mask: everything except the corrupted rows.
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_sparse_csc():
    """Check the inlier mask when X is given as a CSC sparse matrix."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )
    ransac_estimator.fit(sparse.csc_matrix(X), y)

    # Reference mask: everything except the corrupted rows.
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_none_estimator():
    """Check that ``estimator=None`` predicts like an explicit LinearRegression."""
    shared = {"min_samples": 2, "residual_threshold": 5, "random_state": 0}
    ransac_estimator = RANSACRegressor(LinearRegression(), **shared)
    ransac_none_estimator = RANSACRegressor(None, **shared)

    ransac_estimator.fit(X, y)
    ransac_none_estimator.fit(X, y)

    explicit_pred = ransac_estimator.predict(X)
    none_pred = ransac_none_estimator.predict(X)
    assert_array_almost_equal(explicit_pred, none_pred)
def test_ransac_min_n_samples():
    """Check which ``min_samples`` values are accepted, rejected or warned on."""
    estimator = LinearRegression()
    shared = {"residual_threshold": 5, "random_state": 0}

    as_int = RANSACRegressor(estimator, min_samples=2, **shared)
    as_fraction = RANSACRegressor(estimator, min_samples=2.0 / X.shape[0], **shared)
    negative = RANSACRegressor(estimator, min_samples=-1, **shared)
    fractional_above_one = RANSACRegressor(estimator, min_samples=5.2, **shared)
    as_whole_float = RANSACRegressor(estimator, min_samples=2.0, **shared)
    unspecified = RANSACRegressor(estimator, **shared)
    more_than_n_samples = RANSACRegressor(
        estimator, min_samples=X.shape[0] + 1, **shared
    )
    # GH #19390
    ridge_with_none = RANSACRegressor(Ridge(), min_samples=None, **shared)

    # These four configurations must all fit and agree on the predictions.
    for model in (as_int, as_fraction, as_whole_float, unspecified):
        model.fit(X, y)
    for model in (as_fraction, as_whole_float, unspecified):
        assert_array_almost_equal(as_int.predict(X), model.predict(X))

    # These three configurations must be rejected at fit time.
    for model in (negative, fractional_above_one, more_than_n_samples):
        with pytest.raises(ValueError):
            model.fit(X, y)

    err_msg = "From version 1.2, `min_samples` needs to be explicitly set"
    with pytest.warns(FutureWarning, match=err_msg):
        ridge_with_none.fit(X, y)
def test_ransac_multi_dimensional_targets():
    """Check the inlier mask when the target has three identical columns."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )

    # 3-D target values
    yyy = np.column_stack([y, y, y])

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, yyy)

    # Ground truth / reference inlier mask
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_residual_loss():
    """Check that custom and named losses give the same predictions.

    Callable losses (multi-output and single-output) and the named
    ``"squared_error"`` loss are each compared against an estimator built
    with the default loss.
    """

    def loss_multi1(y_true, y_pred):
        # Per-sample sum of absolute residuals over the target columns.
        return np.sum(np.abs(y_true - y_pred), axis=1)

    def loss_multi2(y_true, y_pred):
        # Per-sample sum of squared residuals over the target columns.
        return np.sum((y_true - y_pred) ** 2, axis=1)

    def loss_mono(y_true, y_pred):
        # Per-sample absolute residual for a 1-D target.
        return np.abs(y_true - y_pred)

    yyy = np.column_stack([y, y, y])

    estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(
        estimator, min_samples=2, residual_threshold=5, random_state=0
    )
    ransac_estimator1 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss=loss_multi1,
    )
    ransac_estimator2 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss=loss_multi2,
    )

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator1.predict(X)
    )
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
    )

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.loss = loss_mono
    ransac_estimator2.fit(X, y)
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
    )

    ransac_estimator3 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss="squared_error",
    )
    ransac_estimator3.fit(X, y)
    # Compare the estimator that was just fitted (previously this assertion
    # re-checked ransac_estimator2 and left ransac_estimator3 unverified).
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator3.predict(X)
    )
def test_ransac_default_residual_threshold():
    """Check the inlier mask when ``residual_threshold`` is left unset."""
    ransac_estimator = RANSACRegressor(
        LinearRegression(), min_samples=2, random_state=0
    )

    # Estimate parameters of corrupted data
    ransac_estimator.fit(X, y)

    # Ground truth / reference inlier mask
    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)
def test_ransac_dynamic_max_trials():
    """Check ``_dynamic_max_trials`` values and ``stop_probability`` validation."""
    # Numbers hand-calculated and confirmed on page 119 (Table 4.3) in
    # Hartley, R.~I. and Zisserman, A., 2004,
    # Multiple View Geometry in Computer Vision, Second Edition,
    # Cambridge University Press, ISBN: 0521540518
    #
    # Each row is (n_inliers out of 100 samples, min_samples, expected trials);
    # e denotes the outlier ratio.
    table = [
        (100, 2, 1),  # e = 0%, min_samples = X
        (95, 2, 2),  # e = 5%, min_samples = 2
        (90, 2, 3),  # e = 10%, min_samples = 2
        (70, 2, 7),  # e = 30%, min_samples = 2
        (50, 2, 17),  # e = 50%, min_samples = 2
        (95, 8, 5),  # e = 5%, min_samples = 8
        (90, 8, 9),  # e = 10%, min_samples = 8
        (70, 8, 78),  # e = 30%, min_samples = 8
        (50, 8, 1177),  # e = 50%, min_samples = 8
    ]
    for n_inliers, min_samples, expected in table:
        assert _dynamic_max_trials(n_inliers, 100, min_samples, 0.99) == expected

    # e = 0%, min_samples = 10
    assert _dynamic_max_trials(1, 100, 10, 0) == 0
    assert _dynamic_max_trials(1, 100, 10, 1) == float("inf")

    # stop_probability values below 0 and above 1 must be rejected at fit time.
    estimator = LinearRegression()
    for bad_probability in (-0.1, 1.1):
        ransac_estimator = RANSACRegressor(
            estimator, min_samples=2, stop_probability=bad_probability
        )
        with pytest.raises(ValueError):
            ransac_estimator.fit(X, y)
def test_ransac_fit_sample_weight():
    """Check ``sample_weight`` handling in ``RANSACRegressor.fit``.

    Covers unit weights, the equivalence between integer weights and
    repeated samples, and the error raised when the wrapped estimator does
    not accept ``sample_weight``.
    """
    ransac_estimator = RANSACRegressor(random_state=0)
    n_samples = y.shape[0]
    weights = np.ones(n_samples)
    ransac_estimator.fit(X, y, weights)

    # sanity check
    assert ransac_estimator.inlier_mask_.shape[0] == n_samples

    expected_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False
    # check that mask is correct
    assert_array_equal(ransac_estimator.inlier_mask_, expected_mask)

    # check that fit(X)  = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
    #   X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    X_ = random_state.randint(0, 200, [10, 1])
    y_ = (0.2 * X_ + 2).flatten()
    sample_weight = random_state.randint(0, 10, 10)
    outlier_X = random_state.randint(0, 1000, [1, 1])
    outlier_weight = random_state.randint(0, 10, 1)
    outlier_y = random_state.randint(-1000, 0, 1)

    # Expanded data set: every sample repeated as many times as its weight.
    repeated_X = np.repeat(X_, sample_weight, axis=0)
    repeated_outlier_X = np.repeat(outlier_X, outlier_weight, axis=0)
    X_flat = np.append(repeated_X, repeated_outlier_X, axis=0)

    repeated_y = np.repeat(y_, sample_weight, axis=0)
    repeated_outlier_y = np.repeat(outlier_y, outlier_weight, axis=0)
    y_flat = np.append(repeated_y, repeated_outlier_y, axis=0).flatten()

    ransac_estimator.fit(X_flat, y_flat)
    ref_coef_ = ransac_estimator.estimator_.coef_

    # Compact data set: one row per sample plus the explicit weights.
    sample_weight = np.append(sample_weight, outlier_weight)
    X_ = np.append(X_, outlier_X, axis=0)
    y_ = np.append(y_, outlier_y)
    ransac_estimator.fit(X_, y_, sample_weight)

    assert_allclose(ransac_estimator.estimator_.coef_, ref_coef_)

    # check that if estimator.fit doesn't support
    # sample_weight, raises error
    estimator = OrthogonalMatchingPursuit()
    ransac_estimator = RANSACRegressor(estimator, min_samples=10)

    err_msg = f"{estimator.__class__.__name__} does not support sample_weight."
    with pytest.raises(ValueError, match=err_msg):
        ransac_estimator.fit(X, y, weights)
def test_ransac_final_model_fit_sample_weight():
    """Check the final model against a weighted refit on the inliers."""
    X, y = make_regression(n_samples=1000, random_state=10)
    rng = check_random_state(42)
    raw_weight = rng.randint(1, 4, size=y.shape[0])
    # Normalize the integer weights so that they sum to one.
    sample_weight = raw_weight / raw_weight.sum()

    ransac = RANSACRegressor(estimator=LinearRegression(), random_state=0)
    ransac.fit(X, y, sample_weight=sample_weight)

    # Refit a plain LinearRegression on the inliers with the same weights.
    inliers = ransac.inlier_mask_
    final_model = LinearRegression()
    final_model.fit(X[inliers], y[inliers], sample_weight=sample_weight[inliers])

    assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12)
def test_perfect_horizontal_line():
    """Check that we can fit a line where all samples are inliers.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19497
    """
    X = np.arange(100)[:, None]
    y = np.zeros((100,))

    ransac_estimator = RANSACRegressor(LinearRegression(), random_state=0)
    ransac_estimator.fit(X, y)

    fitted = ransac_estimator.estimator_
    assert_allclose(fitted.coef_, 0.0)
    assert_allclose(fitted.intercept_, 0.0)
# TODO: Remove in v1.2
@pytest.mark.parametrize(
    "old_loss, new_loss",
    [
        # Each deprecated name is paired with the replacement of the same
        # kind (the pairs were previously crossed).
        ("absolute_loss", "absolute_error"),
        ("squared_loss", "squared_error"),
    ],
)
def test_loss_deprecated(old_loss, new_loss):
    """Check that a deprecated loss name warns and matches its replacement.

    Parameters
    ----------
    old_loss : str
        Deprecated loss name; fitting with it must emit a FutureWarning.
    new_loss : str
        Replacement loss name expected to produce the same predictions.
    """
    est1 = RANSACRegressor(loss=old_loss, random_state=0)

    with pytest.warns(FutureWarning, match=f"The loss '{old_loss}' was deprecated"):
        est1.fit(X, y)

    est2 = RANSACRegressor(loss=new_loss, random_state=0)
    est2.fit(X, y)
    assert_allclose(est1.predict(X), est2.predict(X))
def test_base_estimator_deprecated():
    """Check that passing ``base_estimator`` emits a FutureWarning at fit."""
    options = {
        "base_estimator": LinearRegression(),
        "min_samples": 2,
        "residual_threshold": 5,
        "random_state": 0,
    }
    ransac_estimator = RANSACRegressor(**options)

    err_msg = (
        "`base_estimator` was renamed to `estimator` in version 1.1 and "
        "will be removed in 1.3."
    )
    with pytest.warns(FutureWarning, match=err_msg):
        ransac_estimator.fit(X, y)

View File

@@ -0,0 +1,376 @@
import numpy as np
from numpy.testing import assert_allclose
import pytest
import scipy.sparse as sp
from sklearn.datasets import make_regression
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso, ElasticNet, LassoCV, ElasticNetCV
# FIXME: 'normalize' to be removed in 1.2
# Reusable pytest mark: applied as a decorator, it silences warnings whose
# message starts with the 'normalize' deprecation text below.
filterwarnings_normalize = pytest.mark.filterwarnings(
"ignore:'normalize' was deprecated in version 1.0"
)
def test_sparse_coef():
    """Check that the ``sparse_coef_`` property works."""
    clf = ElasticNet()
    # Set the coefficients by hand instead of fitting.
    clf.coef_ = [1, 2, 3]

    assert sp.isspmatrix(clf.sparse_coef_)
    first_row = clf.sparse_coef_.toarray().tolist()[0]
    assert first_row == clf.coef_
@filterwarnings_normalize
def test_normalize_option():
    """Check that the normalize option in enet works for dense and sparse X.

    The dense estimator is fitted on a NumPy array and the sparse estimator
    on the CSC version of the same data; previously both were fitted on a
    sparse matrix, so the dense code path was never exercised.
    """
    X_dense = np.array([[-1], [0], [1]])
    X_sparse = sp.csc_matrix(X_dense)
    y = [-1, 0, 1]

    clf_dense = ElasticNet(normalize=True)
    clf_sparse = ElasticNet(normalize=True)
    clf_dense.fit(X_dense, y)
    clf_sparse.fit(X_sparse, y)

    assert_almost_equal(clf_dense.dual_gap_, 0)
    assert_array_almost_equal(clf_dense.coef_, clf_sparse.coef_)
def test_lasso_zero():
    """Check that the sparse lasso can handle zero data without crashing."""
    # Empty (all-zero) 3x1 sparse design matrix and an all-zero target.
    X = sp.csc_matrix((3, 1))
    y = [0, 0, 0]
    T = np.array([[1], [2], [3]])

    clf = Lasso()
    clf.fit(X, y)
    pred = clf.predict(T)

    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(pred, [0, 0, 0])
    assert_almost_equal(clf.dual_gap_, 0)
@pytest.mark.parametrize("with_sample_weight", [True, False])
def test_enet_toy_list_input(with_sample_weight):
    """Test ElasticNet for various values of alpha and l1_ratio.

    X is an array converted to CSC; Y is a plain list.
    """
    X = sp.csc_matrix(np.array([[-1], [0], [1]]))
    Y = [-1, 0, 1]  # just a straight line
    T = np.array([[2], [3], [4]])  # test sample
    sw = np.array([2.0, 2, 2]) if with_sample_weight else None

    # this should be the same as unregularized least squares
    clf = ElasticNet(alpha=0, l1_ratio=1.0)
    # catch warning about alpha=0.
    # this is discouraged but should work.
    ignore_warnings(clf.fit)(X, Y, sample_weight=sw)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(pred, [2, 3, 4])
    assert_almost_equal(clf.dual_gap_, 0)

    # Regularized cases: (l1_ratio, expected coef, expected predictions).
    regularized_cases = [
        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
    ]
    for l1_ratio, expected_coef, expected_pred in regularized_cases:
        clf = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
        clf.fit(X, Y, sample_weight=sw)
        pred = clf.predict(T)
        assert_array_almost_equal(clf.coef_, expected_coef, decimal=3)
        assert_array_almost_equal(pred, expected_pred, decimal=3)
        assert_almost_equal(clf.dual_gap_, 0)
def test_enet_toy_explicit_sparse_input():
    """Test ElasticNet for various values of alpha and l1_ratio with sparse X."""
    # training samples; entry (1, 0) is left at its implicit zero
    X = sp.lil_matrix((3, 1))
    for row, value in ((0, -1), (2, 1)):
        X[row, 0] = value
    Y = [-1, 0, 1]  # just a straight line (the identity function)

    # test samples
    T = sp.lil_matrix((3, 1))
    for row, value in enumerate((2, 3, 4)):
        T[row, 0] = value

    # this should be the same as lasso
    clf = ElasticNet(alpha=0, l1_ratio=1.0)
    ignore_warnings(clf.fit)(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(pred, [2, 3, 4])
    assert_almost_equal(clf.dual_gap_, 0)

    # Regularized cases: (l1_ratio, expected coef, expected predictions).
    regularized_cases = [
        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
    ]
    for l1_ratio, expected_coef, expected_pred in regularized_cases:
        clf = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
        clf.fit(X, Y)
        pred = clf.predict(T)
        assert_array_almost_equal(clf.coef_, expected_coef, decimal=3)
        assert_array_almost_equal(pred, expected_pred, decimal=3)
        assert_almost_equal(clf.dual_gap_, 0)
def make_sparse_data(
    n_samples=100,
    n_features=100,
    n_informative=10,
    seed=42,
    positive=False,
    n_targets=1,
):
    """Build a seeded sparse regression problem.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the design matrix.
    n_informative : int
        Number of leading features with a non-zero ground-truth weight.
    seed : int
        Seed of the ``RandomState`` used for every draw.
    positive : bool
        If True, the ground-truth weights are replaced by their absolute
        values.
    n_targets : int
        Number of target columns; with a single target ``y`` is raveled.

    Returns
    -------
    X : CSC sparse matrix of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,) or (n_samples, n_targets)
    """
    prng = np.random.RandomState(seed)

    # build an ill-posed linear regression problem with many noisy features and
    # comparatively few samples

    # generate a ground truth model
    weights = prng.randn(n_features, n_targets)
    weights[n_informative:] = 0.0  # only the top features are impacting the model
    if positive:
        weights = np.abs(weights)

    dense_X = prng.randn(n_samples, n_features)
    mask_draw = prng.uniform(size=(n_samples, n_features))
    dense_X[mask_draw > 0.5] = 0.0  # 50% of zeros in input signal

    # generate training ground truth labels
    y = np.dot(dense_X, weights)
    if n_targets == 1:
        y = np.ravel(y)
    return sp.csc_matrix(dense_X), y
def _test_sparse_enet_not_as_toy_dataset(alpha, fit_intercept, positive):
    """Compare sparse and dense ElasticNet fits on ``make_sparse_data`` output.

    The second half of the samples is used for fitting and the first half
    for scoring.
    """
    n_samples, n_features, max_iter = 100, 100, 1000
    n_informative = 10

    X, y = make_sparse_data(n_samples, n_features, n_informative, positive=positive)

    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    enet_params = {
        "alpha": alpha,
        "l1_ratio": 0.8,
        "fit_intercept": fit_intercept,
        "max_iter": max_iter,
        "tol": 1e-7,
        "positive": positive,
        "warm_start": True,
    }

    s_clf = ElasticNet(**enet_params)
    s_clf.fit(X_train, y_train)

    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # check the convergence is the same as the dense version
    d_clf = ElasticNet(**enet_params)
    d_clf.fit(X_train.toarray(), y_train)

    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    assert_almost_equal(s_clf.coef_, d_clf.coef_, 5)
    assert_almost_equal(s_clf.intercept_, d_clf.intercept_, 5)

    # check that the coefs are sparse
    n_nonzero = np.sum(s_clf.coef_ != 0.0)
    assert n_nonzero < 2 * n_informative
def test_sparse_enet_not_as_toy_dataset():
    """Run the sparse/dense ElasticNet comparison for four configurations."""
    # (alpha, fit_intercept, positive), in the order they are exercised.
    configurations = [
        (0.1, False, False),
        (0.1, True, False),
        (1e-3, False, True),
        (1e-3, True, True),
    ]
    for alpha, fit_intercept, positive in configurations:
        _test_sparse_enet_not_as_toy_dataset(
            alpha=alpha, fit_intercept=fit_intercept, positive=positive
        )
def test_sparse_lasso_not_as_toy_dataset():
    """Check sparse and dense Lasso fits on ``make_sparse_data`` output."""
    n_samples = 100
    max_iter = 1000
    n_informative = 10
    X, y = make_sparse_data(n_samples=n_samples, n_informative=n_informative)

    # Second half of the samples for fitting, first half for scoring.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    lasso_params = {
        "alpha": 0.1,
        "fit_intercept": False,
        "max_iter": max_iter,
        "tol": 1e-7,
    }

    s_clf = Lasso(**lasso_params)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # check the convergence is the same as the dense version
    d_clf = Lasso(**lasso_params)
    d_clf.fit(X_train.toarray(), y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    # check that the coefs are sparse
    n_nonzero = np.sum(s_clf.coef_ != 0.0)
    assert n_nonzero == n_informative
def test_enet_multitarget():
    """Check a multi-target fit against one single-target fit per column."""
    n_targets = 3
    X, y = make_sparse_data(n_targets=n_targets)

    estimator = ElasticNet(alpha=0.01, precompute=None)
    # XXX: There is a bug when precompute is not None!
    estimator.fit(X, y)

    # Keep the multi-target results before the estimator is refitted below.
    coef = estimator.coef_
    intercept = estimator.intercept_
    dual_gap = estimator.dual_gap_

    for target in range(n_targets):
        estimator.fit(X, y[:, target])
        assert_array_almost_equal(coef[target, :], estimator.coef_)
        assert_array_almost_equal(intercept[target], estimator.intercept_)
        assert_array_almost_equal(dual_gap[target], estimator.dual_gap_)
def test_path_parameters():
    """Check ElasticNetCV path parameters and sparse/dense ``mse_path_``."""
    X, y = make_sparse_data()
    max_iter = 50
    n_alphas = 10

    clf = ElasticNetCV(
        n_alphas=n_alphas,
        eps=1e-3,
        max_iter=max_iter,
        l1_ratio=0.5,
        fit_intercept=False,
    )
    fit_quietly = ignore_warnings(clf.fit)

    fit_quietly(X, y)  # new params
    assert_almost_equal(0.5, clf.l1_ratio)
    assert n_alphas == clf.n_alphas
    assert n_alphas == len(clf.alphas_)
    sparse_mse_path = clf.mse_path_

    fit_quietly(X.toarray(), y)  # compare with dense data
    assert_almost_equal(clf.mse_path_, sparse_mse_path)
@pytest.mark.parametrize("Model", [Lasso, ElasticNet, LassoCV, ElasticNetCV])
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("n_samples, n_features", [(24, 6), (6, 24)])
@pytest.mark.parametrize("with_sample_weight", [True, False])
def test_sparse_dense_equality(
    Model, fit_intercept, n_samples, n_features, with_sample_weight
):
    """Check that dense and CSC inputs lead to the same fitted model."""
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=n_features // 2,
        n_informative=n_features // 2,
        bias=4 * fit_intercept,
        noise=1,
        random_state=42,
    )
    sw = None
    if with_sample_weight:
        sw = np.abs(np.random.RandomState(42).normal(scale=10, size=y.shape))

    Xs = sp.csc_matrix(X)
    params = {"fit_intercept": fit_intercept}
    reg_dense = Model(**params).fit(X, y, sample_weight=sw)
    reg_sparse = Model(**params).fit(Xs, y, sample_weight=sw)

    if fit_intercept:
        assert reg_sparse.intercept_ == pytest.approx(reg_dense.intercept_)
        # balance property
        weighted_mean_prediction = np.average(reg_sparse.predict(X), weights=sw)
        weighted_mean_target = np.average(y, weights=sw)
        assert weighted_mean_prediction == pytest.approx(weighted_mean_target)
    assert_allclose(reg_sparse.coef_, reg_dense.coef_)
def test_same_output_sparse_dense_lasso_and_enet_cv():
    """Check ElasticNetCV and LassoCV agree between sparse and dense X."""
    X, y = make_sparse_data(n_samples=40, n_features=10)

    # Estimator class and the extra constructor arguments it is built with.
    cv_models = [(ElasticNetCV, {}), (LassoCV, {"cv": 4})]

    for normalize in [True, False]:
        for Estimator, extra in cv_models:
            clfs = Estimator(max_iter=100, normalize=normalize, **extra)
            ignore_warnings(clfs.fit)(X, y)
            clfd = Estimator(max_iter=100, normalize=normalize, **extra)
            ignore_warnings(clfd.fit)(X.toarray(), y)

            assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
            assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
            assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
            assert_array_almost_equal(clfs.alphas_, clfd.alphas_)
def test_same_multiple_output_sparse_dense():
    """Check multi-output ElasticNet predictions agree for list and COO input."""
    X = [
        [0, 1, 2, 3, 4],
        [0, 2, 5, 8, 11],
        [9, 10, 11, 12, 13],
        [10, 11, 12, 13, 14],
    ]
    y = [
        [1, 2, 3, 4, 5],
        [1, 3, 6, 9, 12],
        [10, 11, 12, 13, 14],
        [11, 12, 13, 14, 15],
    ]
    sample = np.array([1, 2, 3, 4, 5]).reshape(1, -1)

    for normalize in [True, False]:
        dense_model = ElasticNet(normalize=normalize)
        ignore_warnings(dense_model.fit)(X, y)
        predict_dense = dense_model.predict(sample)

        sparse_model = ElasticNet(normalize=normalize)
        ignore_warnings(sparse_model.fit)(sp.coo_matrix(X), y)
        predict_sparse = sparse_model.predict(sp.coo_matrix(sample))

        assert_array_almost_equal(predict_sparse, predict_dense)
def test_sparse_enet_coordinate_descent():
    """Test that a warning is issued if model does not converge"""
    n_samples, n_features = 5, 2
    X = sp.csc_matrix((n_samples, n_features)) * 1e50
    y = np.ones(n_samples)
    # Only two coordinate-descent iterations are allowed.
    clf = Lasso(max_iter=2)

    warning_message = (
        "Objective did not converge. You might want "
        "to increase the number of iterations."
    )
    with pytest.warns(ConvergenceWarning, match=warning_message):
        clf.fit(X, y)

View File

@@ -0,0 +1,296 @@
"""
Testing for Theil-Sen module (sklearn.linear_model._theil_sen)
"""
# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
# License: BSD 3 clause
import os
import re
import sys
from contextlib import contextmanager
import numpy as np
import pytest
from numpy.testing import assert_array_equal, assert_array_less
from numpy.testing import assert_array_almost_equal
from scipy.linalg import norm
from scipy.optimize import fmin_bfgs
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, TheilSenRegressor
from sklearn.linear_model._theil_sen import _spatial_median, _breakdown_point
from sklearn.linear_model._theil_sen import _modified_weiszfeld_step
from sklearn.utils._testing import assert_almost_equal
@contextmanager
def no_stdout_stderr():
    """Silence ``sys.stdout`` and ``sys.stderr`` inside the managed block.

    Both streams are pointed at ``os.devnull`` while the block runs. The
    previous stream objects are always put back on exit, including when the
    block raises, so a failing test cannot leave the interpreter writing to
    a closed devnull handle.
    """
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    with open(os.devnull, "w") as devnull:
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
            devnull.flush()
        finally:
            # Restore while devnull is still open so the global streams
            # never reference a closed file.
            sys.stdout = old_stdout
            sys.stderr = old_stderr
def gen_toy_problem_1d(intercept=True):
    """Build a noisy one-feature linear problem seeded with outliers.

    Targets follow ``y = w * x + c + noise`` with ``w = 3.0`` and Gaussian
    noise scaled by 0.1. A handful of samples are then overwritten with
    hand-picked outlier points.

    Parameters
    ----------
    intercept : bool, default=True
        If True, ``c = 2.0`` and 50 samples are drawn; otherwise ``c = 0.1``
        and 100 samples are drawn.

    Returns
    -------
    X : ndarray of shape (n_samples, 1)
    y : ndarray of shape (n_samples,)
    w : float
    c : float
    """
    rng = np.random.RandomState(0)
    w = 3.0
    c, n_samples = (2.0, 50) if intercept else (0.1, 100)

    x = rng.normal(size=n_samples)
    noise = 0.1 * rng.normal(size=n_samples)
    y = w * x + c + noise

    # Sample index -> (x, y) outlier that replaces the generated point.
    if intercept:
        outliers = {
            42: (-2, 4),
            43: (-2.5, 8),
            33: (2.5, 1),
            49: (2.1, 2),
        }
    else:
        outliers = {
            42: (-2, 4),
            43: (-2.5, 8),
            53: (2.5, 1),
            60: (2.1, 2),
            72: (1.8, -7),
        }
    for idx, (x_out, y_out) in outliers.items():
        x[idx], y[idx] = x_out, y_out

    return x[:, np.newaxis], y, w, c
def gen_toy_problem_2d():
    """Build a noisy two-feature linear problem with corrupted targets.

    Targets follow ``y = X @ [5, 10] + 1 + noise`` over 100 samples, with
    Gaussian noise scaled by 0.1. Ten randomly drawn indices (repeats are
    possible) then have their targets replaced by Gaussian draws scaled by 50.

    Returns
    -------
    X : ndarray of shape (100, 2)
    y : ndarray of shape (100,)
    w : ndarray of shape (2,)
    c : float
    """
    rng = np.random.RandomState(0)
    n_samples = 100
    w = np.array([5.0, 10.0])
    c = 1.0

    # The order of the RNG draws below determines the generated data.
    X = rng.normal(size=(n_samples, 2))
    noise = 0.1 * rng.normal(size=n_samples)
    y = X.dot(w) + c + noise

    # Overwrite a tenth of the targets with large-variance outliers.
    n_outliers = n_samples // 10
    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
    return X, y, w, c
def gen_toy_problem_4d():
    """Build a noisy four-feature linear problem with corrupted targets.

    Targets follow ``y = X @ [5, 10, 42, 7] + 1 + noise`` over 10000 samples,
    with Gaussian noise scaled by 0.1. One tenth as many indices are then
    drawn at random (repeats are possible) and their targets replaced by
    Gaussian draws scaled by 50.

    Returns
    -------
    X : ndarray of shape (10000, 4)
    y : ndarray of shape (10000,)
    w : ndarray of shape (4,)
    c : float
    """
    rng = np.random.RandomState(0)
    n_samples = 10000
    w = np.array([5.0, 10.0, 42.0, 7.0])
    c = 1.0

    # The order of the RNG draws below determines the generated data.
    X = rng.normal(size=(n_samples, 4))
    noise = 0.1 * rng.normal(size=n_samples)
    y = X.dot(w) + c + noise

    # Overwrite a tenth of the targets with large-variance outliers.
    n_outliers = n_samples // 10
    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
    return X, y, w, c
def test_modweiszfeld_step_1d():
    """Exercise ``_modified_weiszfeld_step`` on one-dimensional points."""
    X = np.array([1.0, 2.0, 3.0]).reshape(3, 1)
    median = 2.0

    # Starting at the median (which is also a data point) must stay there.
    step = _modified_weiszfeld_step(X, median)
    assert_array_almost_equal(step, median)

    # Starting off the solution -- first at a point outside X (2.5), then at
    # a point of X (3.0) -- must land strictly between the median and the
    # start value.
    for start in (2.5, 3.0):
        step = _modified_weiszfeld_step(X, start)
        assert_array_less(median, step)
        assert_array_less(step, start)

    # With a single vector as the whole data set the step is the identity.
    X = np.array([1.0, 2.0, 3.0]).reshape(1, 3)
    only_point = X[0]
    step = _modified_weiszfeld_step(X, only_point)
    assert_array_equal(only_point, step)
def test_modweiszfeld_step_2d():
    """Exercise ``_modified_weiszfeld_step`` on three points in the plane."""
    X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2)
    start = np.array([0.5, 0.5])

    # The first two iterations are compared against fixed reference values.
    first = _modified_weiszfeld_step(X, start)
    assert_array_almost_equal(first, np.array([1 / 3, 2 / 3]))
    second = _modified_weiszfeld_step(X, first)
    assert_array_almost_equal(second, np.array([0.2792408, 0.7207592]))

    # A step taken from the fix point must return the same point.
    fix_point = np.array([0.21132505, 0.78867497])
    after = _modified_weiszfeld_step(X, fix_point)
    assert_array_almost_equal(after, fix_point)
def test_spatial_median_1d():
    """In one dimension ``_spatial_median`` must match the ordinary median."""
    X = np.array([1.0, 2.0, 3.0]).reshape(3, 1)
    expected = 2.0
    _, computed = _spatial_median(X)
    assert_array_almost_equal(computed, expected)

    # Larger problem: 1000 random integers, compared exactly to np.median.
    rng = np.random.RandomState(0)
    X = rng.randint(100, size=(1000, 1))
    expected = np.median(X.ravel())
    _, computed = _spatial_median(X)
    assert_array_equal(computed, expected)
def test_spatial_median_2d():
    """Check ``_spatial_median`` in 2-D against a BFGS solution.

    Also checks that exhausting ``max_iter`` emits a ConvergenceWarning.
    """
    X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2)
    _, median = _spatial_median(X, max_iter=100, tol=1.0e-6)

    def cost_func(y):
        # Sum of Euclidean distances from y to every row of X.
        return np.sum(np.array([norm(x - y) for x in X]))

    # The median must solve the Fermat-Weber location problem; starting BFGS
    # from it should therefore not move the point.
    fermat_weber = fmin_bfgs(cost_func, median, disp=False)
    assert_array_almost_equal(median, fermat_weber)

    # tol=0.0 with max_iter=30 is expected to hit the iteration cap and warn.
    expected_warning = "Maximum number of iterations 30 reached in spatial median."
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        _spatial_median(X, max_iter=30, tol=0.0)
def test_theil_sen_1d():
    """Theil-Sen recovers slope and intercept on 1-D data where OLS does not."""
    X, y, w, c = gen_toy_problem_1d()

    # Ordinary least squares is thrown off by the outliers.
    ols = LinearRegression().fit(X, y)
    assert np.abs(ols.coef_ - w) > 0.9

    # Theil-Sen matches the true parameters to one decimal.
    robust = TheilSenRegressor(random_state=0).fit(X, y)
    assert_array_almost_equal(robust.coef_, w, 1)
    assert_array_almost_equal(robust.intercept_, c, 1)
def test_theil_sen_1d_no_intercept():
    """Theil-Sen with ``fit_intercept=False`` on the 1-D toy problem."""
    X, y, w, c = gen_toy_problem_1d(intercept=False)
    # The test expects the fitted slope to come out at w + c.
    target_coef = w + c

    # Ordinary least squares is thrown off by the outliers.
    ols = LinearRegression(fit_intercept=False).fit(X, y)
    assert np.abs(ols.coef_ - w - c) > 0.5

    # Theil-Sen gets the slope right and reports a zero intercept.
    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
    assert_array_almost_equal(robust.coef_, target_coef, 1)
    assert_almost_equal(robust.intercept_, 0.0)

    # non-regression test for #18104
    robust.score(X, y)
def test_theil_sen_2d():
    """Theil-Sen recovers the 2-D toy coefficients where OLS does not."""
    X, y, w, c = gen_toy_problem_2d()

    # Ordinary least squares is thrown off by the outliers.
    ols = LinearRegression().fit(X, y)
    assert norm(ols.coef_ - w) > 1.0

    # Theil-Sen matches the true parameters to one decimal.
    robust = TheilSenRegressor(max_subpopulation=1e3, random_state=0)
    robust.fit(X, y)
    assert_array_almost_equal(robust.coef_, w, 1)
    assert_array_almost_equal(robust.intercept_, c, 1)
def test_calc_breakdown_point():
    """``_breakdown_point(1e10, 2)`` must be within 1e-6 of 1 - 1/sqrt(2)."""
    breakdown = _breakdown_point(1e10, 2)
    deviation = np.abs(breakdown - 1 + 1 / (np.sqrt(2)))
    assert deviation < 1.0e-6
@pytest.mark.parametrize(
    "param, ExceptionCls, match",
    [
        (
            {"max_subpopulation": "hello"},
            TypeError,
            "max_subpopulation must be an instance of {float, int}",
        ),
        (
            {"max_subpopulation": -1},
            ValueError,
            "max_subpopulation == -1, must be >= 1",
        ),
        (
            {"n_subsamples": 1},
            ValueError,
            re.escape("Invalid parameter since n_features+1 > n_subsamples (2 > 1)"),
        ),
        (
            {"n_subsamples": 101},
            ValueError,
            re.escape("Invalid parameter since n_subsamples > n_samples (101 > 50)"),
        ),
    ],
)
def test_checksubparams_invalid_input(param, ExceptionCls, match):
    """Invalid constructor parameters must raise the expected error on fit.

    Each case supplies the offending keyword arguments, the exception class
    and the message pattern that ``fit`` on the 1-D toy problem must raise.
    """
    X, y, _, _ = gen_toy_problem_1d()
    regressor = TheilSenRegressor(random_state=0, **param)
    with pytest.raises(ExceptionCls, match=match):
        regressor.fit(X, y)
def test_checksubparams_n_subsamples_if_less_samples_than_features():
    """``n_subsamples=9`` must be rejected for 10 samples and 20 features."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 20
    X = rng.normal(size=(n_samples, n_features))
    y = rng.normal(size=n_samples)

    regressor = TheilSenRegressor(n_subsamples=9, random_state=0)
    with pytest.raises(ValueError):
        regressor.fit(X, y)
def test_subpopulation():
    """Theil-Sen with ``max_subpopulation=250`` still fits the 4-D toy problem."""
    X, y, w, c = gen_toy_problem_4d()
    regressor = TheilSenRegressor(max_subpopulation=250, random_state=0)
    regressor.fit(X, y)
    assert_array_almost_equal(regressor.coef_, w, 1)
    assert_array_almost_equal(regressor.intercept_, c, 1)
def test_subsamples():
    """With ``n_subsamples`` equal to the sample count, Theil-Sen matches OLS."""
    X, y, _, _ = gen_toy_problem_4d()
    n_samples = X.shape[0]
    robust = TheilSenRegressor(n_subsamples=n_samples, random_state=0).fit(X, y)
    ols = LinearRegression().fit(X, y)
    # The coefficients must agree with least squares to 9 decimals.
    assert_array_almost_equal(robust.coef_, ols.coef_, 9)
def test_verbosity():
    """Fitting with ``verbose=True`` must run without raising.

    The output is discarded via ``no_stdout_stderr``; the fit is run once
    with default settings and once with ``max_subpopulation=10``.
    """
    X, y, _, _ = gen_toy_problem_1d()
    with no_stdout_stderr():
        default_model = TheilSenRegressor(verbose=True, random_state=0)
        default_model.fit(X, y)
        capped_model = TheilSenRegressor(
            verbose=True, max_subpopulation=10, random_state=0
        )
        capped_model.fit(X, y)
def test_theil_sen_parallel():
    """Theil-Sen with ``n_jobs=2`` fits the 2-D toy problem where OLS does not."""
    X, y, w, c = gen_toy_problem_2d()

    # Ordinary least squares is thrown off by the outliers.
    ols = LinearRegression().fit(X, y)
    assert norm(ols.coef_ - w) > 1.0

    # Theil-Sen run with two jobs matches the true parameters to one decimal.
    robust = TheilSenRegressor(n_jobs=2, random_state=0, max_subpopulation=2e3)
    robust.fit(X, y)
    assert_array_almost_equal(robust.coef_, w, 1)
    assert_array_almost_equal(robust.intercept_, c, 1)
def test_less_samples_than_features():
    """Behaviour of Theil-Sen when there are fewer samples than features."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 20
    X = rng.normal(size=(n_samples, n_features))
    y = rng.normal(size=n_samples)

    # With fit_intercept=False, Theil-Sen falls back to Least Squares, so
    # the coefficients must coincide.
    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
    ols = LinearRegression(fit_intercept=False).fit(X, y)
    assert_array_almost_equal(robust.coef_, ols.coef_, 12)

    # With fit_intercept=True the result differs from Least Squares because
    # the intercept is calculated differently; the training targets must
    # still be reproduced.
    robust = TheilSenRegressor(fit_intercept=True, random_state=0).fit(X, y)
    fitted = robust.predict(X)
    assert_array_almost_equal(fitted, y, 12)