first commit
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
The :mod:`sklearn.ensemble` module includes ensemble-based methods for
|
||||
classification, regression and anomaly detection.
|
||||
"""
|
||||
from ._base import BaseEnsemble
|
||||
from ._forest import RandomForestClassifier
|
||||
from ._forest import RandomForestRegressor
|
||||
from ._forest import RandomTreesEmbedding
|
||||
from ._forest import ExtraTreesClassifier
|
||||
from ._forest import ExtraTreesRegressor
|
||||
from ._bagging import BaggingClassifier
|
||||
from ._bagging import BaggingRegressor
|
||||
from ._iforest import IsolationForest
|
||||
from ._weight_boosting import AdaBoostClassifier
|
||||
from ._weight_boosting import AdaBoostRegressor
|
||||
from ._gb import GradientBoostingClassifier
|
||||
from ._gb import GradientBoostingRegressor
|
||||
from ._voting import VotingClassifier
|
||||
from ._voting import VotingRegressor
|
||||
from ._stacking import StackingClassifier
|
||||
from ._stacking import StackingRegressor
|
||||
from ._hist_gradient_boosting.gradient_boosting import (
|
||||
HistGradientBoostingRegressor,
|
||||
HistGradientBoostingClassifier,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"BaseEnsemble",
|
||||
"RandomForestClassifier",
|
||||
"RandomForestRegressor",
|
||||
"RandomTreesEmbedding",
|
||||
"ExtraTreesClassifier",
|
||||
"ExtraTreesRegressor",
|
||||
"BaggingClassifier",
|
||||
"BaggingRegressor",
|
||||
"IsolationForest",
|
||||
"GradientBoostingClassifier",
|
||||
"GradientBoostingRegressor",
|
||||
"AdaBoostClassifier",
|
||||
"AdaBoostRegressor",
|
||||
"VotingClassifier",
|
||||
"VotingRegressor",
|
||||
"StackingClassifier",
|
||||
"StackingRegressor",
|
||||
"HistGradientBoostingClassifier",
|
||||
"HistGradientBoostingRegressor",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,329 @@
|
||||
"""Base class for ensemble-based estimators."""
|
||||
|
||||
# Authors: Gilles Louppe
|
||||
# License: BSD 3 clause
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
import numbers
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from joblib import effective_n_jobs
|
||||
|
||||
from ..base import clone
|
||||
from ..base import is_classifier, is_regressor
|
||||
from ..base import BaseEstimator
|
||||
from ..base import MetaEstimatorMixin
|
||||
from ..tree import (
|
||||
DecisionTreeRegressor,
|
||||
ExtraTreeRegressor,
|
||||
BaseDecisionTree,
|
||||
DecisionTreeClassifier,
|
||||
)
|
||||
from ..utils import Bunch, _print_elapsed_time
|
||||
from ..utils import check_random_state
|
||||
from ..utils.metaestimators import _BaseComposition
|
||||
|
||||
|
||||
def _fit_single_estimator(
    estimator, X, y, sample_weight=None, message_clsname=None, message=None
):
    """Fit ``estimator`` on ``(X, y)`` inside a job and hand it back.

    Parameters
    ----------
    estimator : object
        Object exposing ``fit``; it is fitted in place.
    X, y : objects
        Forwarded unchanged to ``estimator.fit``.
    sample_weight : object, default=None
        When not None, forwarded to ``fit`` as the ``sample_weight`` keyword.
    message_clsname, message : objects, default=None
        Forwarded to ``_print_elapsed_time``, which wraps the ``fit`` call.

    Returns
    -------
    estimator : object
        The same object that was passed in, after ``fit`` was called on it.

    Raises
    ------
    TypeError
        If ``fit`` rejects the ``sample_weight`` keyword; the original error
        is chained as the cause. Any other ``TypeError`` propagates as is.
    """
    # Unweighted case first: nothing to translate, just fit and return.
    if sample_weight is None:
        with _print_elapsed_time(message_clsname, message):
            estimator.fit(X, y)
        return estimator

    try:
        with _print_elapsed_time(message_clsname, message):
            estimator.fit(X, y, sample_weight=sample_weight)
    except TypeError as exc:
        rejected_keyword = "unexpected keyword argument 'sample_weight'" in str(exc)
        if not rejected_keyword:
            # Unrelated TypeError: let it surface untouched.
            raise
        raise TypeError(
            "Underlying estimator {} does not support sample weights.".format(
                estimator.__class__.__name__
            )
        ) from exc
    return estimator
|
||||
|
||||
|
||||
def _set_random_states(estimator, random_state=None):
    """Assign integer seeds to every ``random_state`` parameter of an estimator.

    Every parameter named ``random_state`` or ending in ``__random_state``
    that ``estimator.get_params(deep=True)`` reports is set to an integer
    drawn from ``random_state``.

    Parameters
    ----------
    estimator : estimator supporting get/set_params
        Estimator whose ``random_state`` parameters are to be fixed.

    random_state : int, RandomState instance or None, default=None
        Source of the drawn integers; normalised with ``check_random_state``.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Notes
    -----
    Only parameters reachable through ``estimator.get_params()`` are touched.
    The original documentation lists as *not* controlled the ``random_state``
    of:

    * cross-validation splitters
    * ``scipy.stats`` rvs
    """
    rng = check_random_state(random_state)
    upper_bound = np.iinfo(np.int32).max

    # Keys are visited in sorted order so the sequence of draws is the same
    # regardless of the order in which get_params() lists them.
    seeds = {
        name: rng.randint(upper_bound)
        for name in sorted(estimator.get_params(deep=True))
        if name == "random_state" or name.endswith("__random_state")
    }

    if seeds:
        estimator.set_params(**seeds)
|
||||
|
||||
|
||||
class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
    """Abstract foundation shared by the ensemble estimators.

    Warning: This class should not be used directly. Use derived classes
    instead.

    Parameters
    ----------
    base_estimator : object
        The base estimator from which the ensemble is built.

    n_estimators : int, default=10
        The number of estimators in the ensemble.

    estimator_params : list of str, default=tuple()
        Names of attributes of the ensemble that are copied, as parameters,
        onto each new clone of the base estimator. If none are given,
        nothing is overridden on the clone.

    Attributes
    ----------
    base_estimator_ : estimator
        The base estimator from which the ensemble is grown; set by
        ``_validate_estimator``.

    estimators_ : list of estimators
        The collection of fitted base estimators.
    """

    # overwrite _required_parameters from MetaEstimatorMixin
    _required_parameters: List[str] = []

    @abstractmethod
    def __init__(self, base_estimator, *, n_estimators=10, estimator_params=tuple()):
        # Only store the parameters here. Sub-estimators must not be built
        # yet because the parameters of base_estimator might still change
        # (e.g. when grid-searching with the nested object syntax).
        # self.estimators_ has to be filled by the derived classes in fit.
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.estimator_params = estimator_params

    def _validate_estimator(self, default=None):
        """Validate ``n_estimators`` and resolve ``base_estimator_``.

        Sets the ``base_estimator_`` attribute to ``self.base_estimator``,
        or to ``default`` when ``self.base_estimator`` is None.

        Raises
        ------
        ValueError
            If ``n_estimators`` is not an integer, is not strictly positive,
            or if both ``base_estimator`` and ``default`` are None.
        """
        if not isinstance(self.n_estimators, numbers.Integral):
            raise ValueError(
                "n_estimators must be an integer, got {0}.".format(
                    type(self.n_estimators)
                )
            )

        if self.n_estimators <= 0:
            raise ValueError(
                "n_estimators must be greater than zero, got {0}.".format(
                    self.n_estimators
                )
            )

        # The attribute is assigned before the None check, as in the
        # original control flow.
        self.base_estimator_ = (
            default if self.base_estimator is None else self.base_estimator
        )

        if self.base_estimator_ is None:
            raise ValueError("base_estimator cannot be None")

    def _make_estimator(self, append=True, random_state=None):
        """Clone ``base_estimator_`` and configure the clone.

        Warning: This method should be used to properly instantiate new
        sub-estimators.

        Parameters
        ----------
        append : bool, default=True
            Whether to append the new estimator to ``self.estimators_``.
        random_state : object, default=None
            When not None, passed to ``_set_random_states`` to seed the clone.

        Returns
        -------
        estimator : object
            The configured clone.
        """
        estimator = clone(self.base_estimator_)
        overrides = {name: getattr(self, name) for name in self.estimator_params}
        estimator.set_params(**overrides)

        # TODO: Remove in v1.2
        # criterion "mse" and "mae" would cause warnings in every call to
        # DecisionTreeRegressor.fit(..)
        if isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)):
            criterion = getattr(estimator, "criterion", None)
            if criterion == "mse":
                estimator.set_params(criterion="squared_error")
            elif criterion == "mae":
                estimator.set_params(criterion="absolute_error")

        # TODO(1.3): Remove
        # max_features = 'auto' would cause warnings in every call to
        # Tree.fit(..)
        uses_auto_features = (
            isinstance(estimator, BaseDecisionTree)
            and getattr(estimator, "max_features", None) == "auto"
        )
        if uses_auto_features:
            if isinstance(estimator, DecisionTreeClassifier):
                estimator.set_params(max_features="sqrt")
            elif isinstance(estimator, DecisionTreeRegressor):
                estimator.set_params(max_features=1.0)

        if random_state is not None:
            _set_random_states(estimator, random_state)

        if append:
            self.estimators_.append(estimator)

        return estimator

    def __len__(self):
        """Return how many estimators ``estimators_`` currently holds."""
        return len(self.estimators_)

    def __getitem__(self, index):
        """Return ``estimators_[index]``."""
        return self.estimators_[index]

    def __iter__(self):
        """Return an iterator over ``estimators_``."""
        return iter(self.estimators_)
|
||||
|
||||
|
||||
def _partition_estimators(n_estimators, n_jobs):
    """Split ``n_estimators`` estimators as evenly as possible between jobs.

    Parameters
    ----------
    n_estimators : int
        Total number of estimators to distribute.
    n_jobs : int or None
        Requested number of jobs, resolved through ``effective_n_jobs``.

    Returns
    -------
    n_jobs : int
        Number of jobs actually used: the resolved job count, capped at
        ``n_estimators``.
    n_estimators_per_job : list of int
        Number of estimators assigned to each job.
    starts : list of int
        Cumulative offsets beginning with 0; it has one more entry than
        there are jobs.
    """
    # Never use more jobs than there are estimators.
    n_workers = min(effective_n_jobs(n_jobs), n_estimators)

    # Every worker gets the floor share; the first `leftover` workers take
    # one extra estimator each.
    per_worker = np.full(n_workers, n_estimators // n_workers, dtype=int)
    leftover = n_estimators % n_workers
    per_worker[:leftover] += 1

    offsets = [0]
    offsets.extend(np.cumsum(per_worker).tolist())

    return n_workers, per_worker.tolist(), offsets
|
||||
|
||||
|
||||
class _BaseHeterogeneousEnsemble(
    MetaEstimatorMixin, _BaseComposition, metaclass=ABCMeta
):
    """Abstract base for ensembles built from differently-typed learners.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        The ensemble of estimators to use in the ensemble. Each element of the
        list is defined as a tuple of string (i.e. name of the estimator) and
        an estimator instance. An estimator can be set to `'drop'` using
        `set_params`.

    Attributes
    ----------
    estimators_ : list of estimators
        The elements of the estimators parameter, having been fitted on the
        training data. If an estimator has been set to `'drop'`, it will not
        appear in `estimators_`.
    """

    _required_parameters = ["estimators"]

    @property
    def named_estimators(self):
        """Name-based access to the entries of ``estimators``.

        Returns
        -------
        :class:`~sklearn.utils.Bunch`
            Built from the ``(name, estimator)`` pairs of ``self.estimators``.
        """
        by_name = dict(self.estimators)
        return Bunch(**by_name)

    @abstractmethod
    def __init__(self, estimators):
        self.estimators = estimators

    def _validate_estimators(self):
        """Check the ``estimators`` parameter and split it into two tuples.

        Returns
        -------
        names : tuple of str
            The first element of every pair.
        estimators : tuple
            The second element of every pair (possibly ``"drop"``).

        Raises
        ------
        ValueError
            If ``estimators`` is None or empty, if every entry is ``"drop"``,
            or if a non-dropped entry is not of the same kind (classifier or
            regressor) as this ensemble.
        """
        pairs = self.estimators
        if pairs is None or len(pairs) == 0:
            raise ValueError(
                "Invalid 'estimators' attribute, 'estimators' should be a list"
                " of (string, estimator) tuples."
            )
        names, estimators = zip(*pairs)
        # name validation helper inherited from the base classes
        self._validate_names(names)

        active = [est for est in estimators if est != "drop"]
        if not active:
            raise ValueError(
                "All estimators are dropped. At least one is required "
                "to be an estimator."
            )

        # Sub-estimators must be of the same kind as the ensemble itself.
        if is_classifier(self):
            is_estimator_type = is_classifier
        else:
            is_estimator_type = is_regressor

        for est in active:
            if not is_estimator_type(est):
                # Strip the leading "is_" to name the expected kind.
                expected_kind = is_estimator_type.__name__[3:]
                raise ValueError(
                    "The estimator {} should be a {}.".format(
                        est.__class__.__name__, expected_kind
                    )
                )

        return names, estimators

    def set_params(self, **params):
        """
        Set the parameters of an estimator from the ensemble.

        Valid parameter keys can be listed with `get_params()`. Note that you
        can directly set the parameters of the estimators contained in
        `estimators`.

        Parameters
        ----------
        **params : keyword arguments
            Specific parameters using e.g.
            `set_params(parameter_name=new_value)`. In addition to setting the
            parameters of the estimator, the individual estimators of
            `estimators` can also be set, or can be removed by setting them
            to 'drop'.

        Returns
        -------
        self : object
            Estimator instance.
        """
        # Delegates to the inherited helper, keyed on the "estimators" list.
        super()._set_params("estimators", **params)
        return self

    def get_params(self, deep=True):
        """
        Get the parameters of an estimator from the ensemble.

        Returns the parameters given in the constructor as well as the
        estimators contained within the `estimators` parameter.

        Parameters
        ----------
        deep : bool, default=True
            Setting it to True gets the various estimators and the parameters
            of the estimators as well.

        Returns
        -------
        params : dict
            Parameter and estimator names mapped to their values or parameter
            names mapped to their values.
        """
        collected = super()._get_params("estimators", deep=deep)
        return collected
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,5 @@
|
||||
"""This module implements histogram-based gradient boosting estimators.
|
||||
|
||||
The implementation is a port from pygbm which is itself strongly inspired
|
||||
by LightGBM.
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,18 @@
|
||||
from .common cimport X_BINNED_DTYPE_C
|
||||
from .common cimport BITSET_DTYPE_C
|
||||
from .common cimport BITSET_INNER_DTYPE_C
|
||||
from .common cimport X_DTYPE_C
|
||||
|
||||
cdef void init_bitset(BITSET_DTYPE_C bitset) nogil
|
||||
|
||||
cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil
|
||||
|
||||
cdef unsigned char in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil
|
||||
|
||||
cpdef unsigned char in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset,
|
||||
X_BINNED_DTYPE_C val) nogil
|
||||
|
||||
cdef unsigned char in_bitset_2d_memoryview(
|
||||
const BITSET_INNER_DTYPE_C [:, :] bitset,
|
||||
X_BINNED_DTYPE_C val,
|
||||
unsigned int row) nogil
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,316 @@
|
||||
"""
|
||||
This module contains the BinMapper class.
|
||||
|
||||
BinMapper is used for mapping a real-valued dataset into integer-valued bins.
|
||||
Bin thresholds are computed with the quantiles so that each bin contains
|
||||
approximately the same number of samples.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...utils import check_random_state, check_array
|
||||
from ...base import BaseEstimator, TransformerMixin
|
||||
from ...utils.validation import check_is_fitted
|
||||
from ...utils.fixes import percentile
|
||||
from ...utils._openmp_helpers import _openmp_effective_n_threads
|
||||
from ._binning import _map_to_bins
|
||||
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF, X_BITSET_INNER_DTYPE
|
||||
from ._bitset import set_bitset_memoryview
|
||||
|
||||
|
||||
def _find_binning_thresholds(col_data, max_bins):
    """Compute the bin thresholds of one continuous feature.

    Missing values (NaN) are ignored for finding the thresholds.

    Parameters
    ----------
    col_data : array-like, shape (n_samples,)
        The continuous feature to bin.
    max_bins : int
        The maximum number of bins to use for non-missing values. If the
        feature has no more than ``max_bins`` unique values, the midpoints
        between consecutive unique values are used as thresholds instead of
        the quantiles.

    Returns
    -------
    binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)
        The increasing numeric values that can be used to separate the bins.
        A given value x will be mapped into bin value i iff
        binning_thresholds[i - 1] < x <= binning_thresholds[i]
    """
    # ignore missing values when computing bin thresholds
    nan_mask = np.isnan(col_data)
    if nan_mask.any():
        col_data = col_data[~nan_mask]
    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
    unique_values = np.unique(col_data)

    if len(unique_values) > max_bins:
        # The data is sorted again here. Approximate midpoint percentiles
        # could be derived from np.unique(col_data, return_counts) instead,
        # but that is more work for a limited benefit since this operates on
        # a fixed-size subsample of the full data.
        inner_percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
        thresholds = percentile(
            col_data, inner_percentiles, method="midpoint"
        ).astype(X_DTYPE)
        assert thresholds.shape[0] == max_bins - 1
    else:
        # Few distinct values: split halfway between consecutive ones.
        thresholds = (unique_values[:-1] + unique_values[1:]) * 0.5

    # We avoid having +inf thresholds: +inf thresholds are only allowed in
    # a "split on nan" situation.
    np.clip(thresholds, a_min=None, a_max=ALMOST_INF, out=thresholds)
    return thresholds
|
||||
|
||||
|
||||
class _BinMapper(TransformerMixin, BaseEstimator):
    """Transformer that maps a dataset into integer-valued bins.

    For continuous features, the bins are created in a feature-wise fashion,
    using quantiles so that each bin contains approximately the same number
    of samples. For large datasets, quantiles are computed on a subset of the
    data to speed-up the binning, but the quantiles should remain stable.

    For categorical features, the raw categorical values are expected to be
    in [0, 254] (this is not validated here though) and each category
    corresponds to a bin. All categorical values must be known at
    initialization: transform() doesn't know how to bin unknown categorical
    values. Note that transform() is only used on non-training data in the
    case of early stopping.

    Features with a small number of values may be binned into less than
    ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
    for missing values.

    Parameters
    ----------
    n_bins : int, default=256
        The maximum number of bins to use (including the bin for missing
        values). Should be in [3, 256]. Non-missing values are binned on
        ``max_bins = n_bins - 1`` bins. The last bin is always reserved for
        missing values. If for a given feature the number of unique values is
        less than ``max_bins``, then those unique values will be used to
        compute the bin thresholds, instead of the quantiles. For categorical
        features indicated by ``is_categorical``, the docstring for
        ``is_categorical`` details on this procedure.
    subsample : int or None, default=2e5
        If ``n_samples > subsample``, then ``subsample`` samples will be
        randomly chosen to compute the quantiles. If ``None``, the whole data
        is used.
    is_categorical : ndarray of bool of shape (n_features,), default=None
        Indicates categorical features. By default, all features are
        considered continuous.
    known_categories : list of {ndarray, None} of shape (n_features,), \
            default=None
        For each categorical feature, the array indicates the set of unique
        categorical values. These should be the possible values over all the
        data, not just the training data. For continuous features, the
        corresponding entry should be None.
    random_state: int, RandomState instance or None, default=None
        Pseudo-random number generator to control the random sub-sampling.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.
    n_threads : int, default=None
        Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
        to determine the effective number of threads to use, which takes
        cgroups CPU quotas into account. See the docstring of
        `_openmp_effective_n_threads` for details.

    Attributes
    ----------
    bin_thresholds_ : list of ndarray
        For each feature, each array indicates how to map a feature into a
        binned feature. The semantic and size depends on the nature of the
        feature:
        - for real-valued features, the array corresponds to the real-valued
          bin thresholds (the upper bound of each bin). There are ``max_bins
          - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of
          bins used for non-missing values.
        - for categorical features, the array is a map from a binned category
          value to the raw category value. The size of the array is equal to
          ``min(max_bins, category_cardinality)`` where we ignore missing
          values in the cardinality.
    n_bins_non_missing_ : ndarray, dtype=np.uint32
        For each feature, gives the number of bins actually used for
        non-missing values. For features with a lot of unique values, this is
        equal to ``n_bins - 1``.
    is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
        Indicator for categorical features.
    missing_values_bin_idx_ : np.uint8
        The index of the bin where missing values are mapped. This is a
        constant across all features. This corresponds to the last bin, and
        it is always equal to ``n_bins - 1``. Note that if
        ``n_bins_non_missing_`` is less than ``n_bins - 1`` for a given
        feature, then there are empty (and unused) bins.
    """

    def __init__(
        self,
        n_bins=256,
        subsample=int(2e5),
        is_categorical=None,
        known_categories=None,
        random_state=None,
        n_threads=None,
    ):
        self.n_bins = n_bins
        self.subsample = subsample
        self.is_categorical = is_categorical
        self.known_categories = known_categories
        self.random_state = random_state
        self.n_threads = n_threads

    def fit(self, X, y=None):
        """Learn the binning thresholds of every feature of X.

        The last bin is reserved for missing values, whether missing values
        are present in the data or not.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.
        y: None
            Ignored.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``n_bins`` is outside [3, 256], if a feature flagged as
            categorical has no known categories, or if categories are given
            for a feature that is not flagged as categorical.
        """
        if not (3 <= self.n_bins <= 256):
            # min is 3: at least 2 distinct bins and a missing values bin
            raise ValueError(
                "n_bins={} should be no smaller than 3 and no larger than 256.".format(
                    self.n_bins
                )
            )

        X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
        max_bins = self.n_bins - 1

        # Thresholds are estimated on a random subset when the data is large.
        rng = check_random_state(self.random_state)
        n_rows = X.shape[0]
        if self.subsample is not None and n_rows > self.subsample:
            picked_rows = rng.choice(n_rows, self.subsample, replace=False)
            X = X.take(picked_rows, axis=0)

        n_features = X.shape[1]
        if self.is_categorical is None:
            self.is_categorical_ = np.zeros(n_features, dtype=np.uint8)
        else:
            self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8)

        known_categories = (
            [None] * n_features
            if self.known_categories is None
            else self.known_categories
        )

        # validate is_categorical and known_categories parameters
        for f_idx in range(n_features):
            flagged_categorical = self.is_categorical_[f_idx]
            feature_cats = known_categories[f_idx]
            if flagged_categorical:
                if feature_cats is None:
                    raise ValueError(
                        f"Known categories for feature {f_idx} must be provided."
                    )
            elif feature_cats is not None:
                raise ValueError(
                    f"Feature {f_idx} isn't marked as a categorical feature, "
                    "but categories were passed."
                )

        self.missing_values_bin_idx_ = self.n_bins - 1

        self.bin_thresholds_ = []
        bins_used = []

        for f_idx in range(n_features):
            if self.is_categorical_[f_idx]:
                # Since categories are assumed to be encoded in
                # [0, n_cats] and since n_cats <= max_bins,
                # the thresholds *are* the unique categorical values. This will
                # lead to the correct mapping in transform()
                feature_thresholds = known_categories[f_idx]
                bins_used.append(feature_thresholds.shape[0])
            else:
                feature_thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)
                bins_used.append(feature_thresholds.shape[0] + 1)

            self.bin_thresholds_.append(feature_thresholds)

        self.n_bins_non_missing_ = np.array(bins_used, dtype=np.uint32)
        return self

    def transform(self, X):
        """Bin data X.

        Missing values will be mapped to the last bin.

        For categorical features, the mapping will be incorrect for unknown
        categories. Since the BinMapper is given known_categories of the
        entire training data (i.e. before the call to train_test_split() in
        case of early-stopping), this never happens.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.

        Returns
        -------
        X_binned : array-like of shape (n_samples, n_features)
            The binned data (fortran-aligned).

        Raises
        ------
        ValueError
            If X does not have the number of features seen during fit.
        """
        X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
        check_is_fitted(self)

        n_fitted_features = self.n_bins_non_missing_.shape[0]
        if X.shape[1] != n_fitted_features:
            raise ValueError(
                "This estimator was fitted with {} features but {} got passed "
                "to transform()".format(n_fitted_features, X.shape[1])
            )

        n_threads = _openmp_effective_n_threads(self.n_threads)
        # Output buffer, handed to _map_to_bins as its last argument.
        X_binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
        _map_to_bins(
            X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, X_binned
        )
        return X_binned

    def make_known_categories_bitsets(self):
        """Create bitsets of known categories.

        Returns
        -------
        - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
            Array of bitsets of known categories, for each categorical feature.
        - f_idx_map : ndarray of shape (n_features,)
            Map from original feature index to the corresponding index in the
            known_cat_bitsets array.
        """
        cat_feature_indices = np.flatnonzero(self.is_categorical_)
        n_cat_features = cat_feature_indices.size

        # Entries for non-categorical features stay at 0.
        f_idx_map = np.zeros(self.is_categorical_.size, dtype=np.uint32)
        f_idx_map[cat_feature_indices] = np.arange(n_cat_features, dtype=np.uint32)

        known_cat_bitsets = np.zeros(
            (n_cat_features, 8), dtype=X_BITSET_INNER_DTYPE
        )

        # TODO: complexity is O(n_categorical_features * 255). Maybe this is
        # worth cythonizing
        for bitset_row, f_idx in enumerate(cat_feature_indices):
            # For categorical features bin_thresholds_ holds the raw
            # category values themselves.
            for raw_cat_val in self.bin_thresholds_[f_idx]:
                set_bitset_memoryview(known_cat_bitsets[bitset_row], raw_cat_val)

        return known_cat_bitsets, f_idx_map
|
||||
Binary file not shown.
@@ -0,0 +1,44 @@
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
np.import_array()
|
||||
|
||||
|
||||
ctypedef np.npy_float64 X_DTYPE_C
|
||||
ctypedef np.npy_uint8 X_BINNED_DTYPE_C
|
||||
ctypedef np.npy_float64 Y_DTYPE_C
|
||||
ctypedef np.npy_float32 G_H_DTYPE_C
|
||||
ctypedef np.npy_uint32 BITSET_INNER_DTYPE_C
|
||||
ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C
|
||||
|
||||
# One histogram entry: two Y_DTYPE_C accumulators plus an unsigned count.
# NOTE(review): presumably one entry per (feature, bin) pair - confirm
# against the histogram builder.
cdef packed struct hist_struct:
|
||||
# Same as histogram dtype but we need a struct to declare views. It needs
|
||||
# to be packed since by default numpy dtypes aren't aligned
|
||||
Y_DTYPE_C sum_gradients
|
||||
Y_DTYPE_C sum_hessians
|
||||
unsigned int count
|
||||
|
||||
|
||||
# Record describing a single tree node, laid out for use in memory views.
# NOTE(review): field order presumably has to match PREDICTOR_RECORD_DTYPE
# exactly, since the struct is described as its equivalent - confirm in
# common.pyx.
cdef packed struct node_struct:
|
||||
# Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It
|
||||
# needs to be packed since by default numpy dtypes aren't aligned
|
||||
Y_DTYPE_C value
|
||||
unsigned int count
|
||||
unsigned int feature_idx
|
||||
X_DTYPE_C num_threshold
|
||||
unsigned char missing_go_to_left
|
||||
unsigned int left
|
||||
unsigned int right
|
||||
Y_DTYPE_C gain
|
||||
unsigned int depth
|
||||
unsigned char is_leaf
|
||||
X_BINNED_DTYPE_C bin_threshold
|
||||
unsigned char is_categorical
|
||||
# The index of the corresponding bitsets in the Predictor's bitset arrays.
|
||||
# Only used if is_categorical is True
|
||||
unsigned int bitset_idx
|
||||
|
||||
# Integer codes for the monotonic constraint of a feature.
# NOTE(review): presumably NO_CST = unconstrained, POS = increasing and
# NEG = decreasing - confirm against the splitter / grower code.
cpdef enum MonotonicConstraint:
|
||||
NO_CST = 0
|
||||
POS = 1
|
||||
NEG = -1
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,709 @@
|
||||
"""
|
||||
This module contains the TreeGrower class.
|
||||
|
||||
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
|
||||
the gradients and hessians of the training data.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
from heapq import heappush, heappop
|
||||
import numpy as np
|
||||
from timeit import default_timer as time
|
||||
import numbers
|
||||
|
||||
from .splitting import Splitter
|
||||
from .histogram import HistogramBuilder
|
||||
from .predictor import TreePredictor
|
||||
from .utils import sum_parallel
|
||||
from .common import PREDICTOR_RECORD_DTYPE
|
||||
from .common import X_BITSET_INNER_DTYPE
|
||||
from .common import Y_DTYPE
|
||||
from .common import MonotonicConstraint
|
||||
from ._bitset import set_raw_bitset_from_binned_bitset
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
|
||||
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
|
||||
|
||||
|
||||
class TreeNode:
    """Node of a tree under construction, as handled by TreeGrower.

    Instances only exist while a tree is being trained; predictions are made
    with a TreePredictor instead.

    Parameters
    ----------
    depth : int
        Distance between the node and the root.
    sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint
        Indices of the samples that reached the node.
    sum_gradients : float
        Sum of the gradients over the samples of the node.
    sum_hessians : float
        Sum of the hessians over the samples of the node.
    value : float, default=None
        Value attached to the node.

    Attributes
    ----------
    depth : int
        Distance between the node and the root.
    sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint
        Indices of the samples that reached the node.
    n_samples : int
        Number of samples at the node, i.e. ``sample_indices.shape[0]``.
    sum_gradients : float
        Sum of the gradients over the samples of the node.
    sum_hessians : float
        Sum of the hessians over the samples of the node.
    value : float or None
        Value given at construction time.
    is_leaf : bool
        False at construction; switched on by the grower when the node is
        finalized as a leaf.
    split_info : SplitInfo or None
        Outcome of the split evaluation, None until it has been computed.
    histograms : object or None
        Histograms of the node, None until they have been computed.
    left_child : TreeNode or None
        Left child, None for leaves.
    right_child : TreeNode or None
        Right child, None for leaves.
    children_lower_bound, children_upper_bound : float
        Bounds imposed on the values of the *children* of this node, see
        ``set_children_bounds``. Unbounded by default.
    partition_start : int
        Start position of the node's sample_indices in splitter.partition.
    partition_stop : int
        Stop position of the node's sample_indices in splitter.partition.
    """

    # Class-level defaults, overridden per instance once the grower has
    # something to store.
    split_info = None
    histograms = None
    left_child = None
    right_child = None

    # Start and stop positions of the node inside the splitter.partition
    # array, i.e.
    #   self.sample_indices = view(self.splitter.partition[start:stop])
    # See the comments about splitter.partition and splitter.split_indices
    # for the rationale. Both are only needed in _update_raw_prediction, which
    # walks over the leaves: the sample_indices views all have different
    # sizes, so they cannot be stored efficiently as such.
    partition_start = 0
    partition_stop = 0

    def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=None):
        self.sample_indices = sample_indices
        self.n_samples = sample_indices.shape[0]
        self.depth = depth
        self.sum_gradients = sum_gradients
        self.sum_hessians = sum_hessians
        self.value = value
        self.is_leaf = False
        # Children values are unconstrained until the grower says otherwise.
        unbounded_below = float("-inf")
        unbounded_above = float("+inf")
        self.set_children_bounds(unbounded_below, unbounded_above)

    def set_children_bounds(self, lower, upper):
        """Set children values bounds to respect monotonic constraints."""
        # The bounds constrain the values of the node's *children*, not the
        # value of the node itself. The splitter reads them when it evaluates
        # candidate left and right children.
        self.children_lower_bound, self.children_upper_bound = lower, upper

    def __lt__(self, other_node):
        """Order nodes for the priority queue of splittable nodes.

        heapq only relies on '<' and pops the smallest item first, so a node
        with a larger gain has to compare as "smaller" in order to be split
        first.

        Parameters
        ----------
        other_node : TreeNode
            The node to compare with.
        """
        own_gain = self.split_info.gain
        return own_gain > other_node.split_info.gain
|
||||
|
||||
|
||||
class TreeGrower:
    """Tree grower class used to build a tree.

    The tree is fitted to predict the values of a Newton-Raphson step. The
    splits are considered in a best-first fashion, and the quality of a
    split is defined in splitting._split_gain.

    Parameters
    ----------
    X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8
        The binned input samples. Must be Fortran-contiguous.
    gradients : ndarray of shape (n_samples,)
        The gradients of each training sample. Those are the gradients of the
        loss w.r.t the predictions, evaluated at iteration ``i - 1``.
    hessians : ndarray of shape (n_samples,)
        The hessians of each training sample. Those are the hessians of the
        loss w.r.t the predictions, evaluated at iteration ``i - 1``. An
        array whose first dimension is 1 is interpreted as constant hessians.
    max_leaf_nodes : int, default=None
        The maximum number of leaves for each tree. If None, there is no
        maximum limit.
    max_depth : int, default=None
        The maximum depth of each tree. The depth of a tree is the number of
        edges to go from the root to the deepest leaf.
        Depth isn't constrained by default.
    min_samples_leaf : int, default=20
        The minimum number of samples per leaf.
    min_gain_to_split : float, default=0.
        The minimum gain needed to split a node. Splits with lower gain will
        be ignored.
    n_bins : int, default=256
        The total number of bins, including the bin for missing values. Used
        to define the shape of the histograms.
    n_bins_non_missing : ndarray, dtype=np.uint32, default=None
        For each feature, gives the number of bins actually used for
        non-missing values. For features with a lot of unique values, this
        is equal to ``n_bins - 1``. If it's an int, all features are
        considered to have the same number of bins. If None, all features
        are considered to have ``n_bins - 1`` bins.
    has_missing_values : bool or ndarray, dtype=bool, default=False
        Whether each feature contains missing values (in the training data).
        If it's a bool, the same value is used for all features.
    is_categorical : ndarray of bool of shape (n_features,), default=None
        Indicates categorical features.
    monotonic_cst : array-like of shape (n_features,), dtype=int, default=None
        Indicates the monotonic constraint to enforce on each feature. -1, 0
        and 1 respectively correspond to a negative constraint, no constraint
        and a positive constraint. Read more in the :ref:`User Guide
        <monotonic_cst_gbdt>`.
    l2_regularization : float, default=0.
        The L2 regularization parameter.
    min_hessian_to_split : float, default=1e-3
        The minimum sum of hessians needed in each node. Splits that result in
        at least one child having a sum of hessians less than
        ``min_hessian_to_split`` are discarded.
    shrinkage : float, default=1.
        The shrinkage parameter to apply to the leaves values, also known as
        learning rate.
    n_threads : int, default=None
        Number of OpenMP threads to use. `_openmp_effective_n_threads` is called
        to determine the effective number of threads to use, which takes
        cgroups CPU quotas into account. See the docstring of
        `_openmp_effective_n_threads` for details.
    """

    def __init__(
        self,
        X_binned,
        gradients,
        hessians,
        max_leaf_nodes=None,
        max_depth=None,
        min_samples_leaf=20,
        min_gain_to_split=0.0,
        n_bins=256,
        n_bins_non_missing=None,
        has_missing_values=False,
        is_categorical=None,
        monotonic_cst=None,
        l2_regularization=0.0,
        min_hessian_to_split=1e-3,
        shrinkage=1.0,
        n_threads=None,
    ):
        self._validate_parameters(
            X_binned,
            max_leaf_nodes,
            max_depth,
            min_samples_leaf,
            min_gain_to_split,
            l2_regularization,
            min_hessian_to_split,
        )
        n_threads = _openmp_effective_n_threads(n_threads)
        n_features = X_binned.shape[1]

        # Normalize n_bins_non_missing to one uint32 entry per feature.
        if n_bins_non_missing is None:
            n_bins_non_missing = n_bins - 1
        if isinstance(n_bins_non_missing, numbers.Integral):
            n_bins_non_missing = np.array(
                [n_bins_non_missing] * n_features, dtype=np.uint32
            )
        else:
            n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)

        # Normalize has_missing_values to one uint8 flag per feature. numpy
        # boolean scalars are broadcast like builtin bools: left as is they
        # would become a 0-d array that cannot be indexed by feature.
        if isinstance(has_missing_values, (bool, np.bool_)):
            has_missing_values = [has_missing_values] * n_features
        has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)

        # Normalize monotonic_cst to one int8 code per feature.
        self.with_monotonic_cst = monotonic_cst is not None
        if monotonic_cst is None:
            monotonic_cst = np.full(
                shape=n_features,
                fill_value=MonotonicConstraint.NO_CST,
                dtype=np.int8,
            )
        else:
            monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
            if monotonic_cst.shape[0] != n_features:
                raise ValueError(
                    "monotonic_cst has shape {} but the input data "
                    "X has {} features.".format(monotonic_cst.shape[0], n_features)
                )
            if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
                raise ValueError(
                    "monotonic_cst must be None or an array-like of -1, 0 or 1."
                )

        # Normalize is_categorical to one uint8 flag per feature.
        if is_categorical is None:
            is_categorical = np.zeros(shape=n_features, dtype=np.uint8)
        else:
            is_categorical = np.asarray(is_categorical, dtype=np.uint8)

        constrained_categorical = np.logical_and(
            is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST
        )
        if np.any(constrained_categorical):
            raise ValueError("Categorical features cannot have monotonic constraints.")

        # A single hessian value stands for "same hessian for every sample".
        hessians_are_constant = hessians.shape[0] == 1
        self.histogram_builder = HistogramBuilder(
            X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
        )
        # The last bin is the one reserved for missing values.
        missing_values_bin_idx = n_bins - 1
        self.splitter = Splitter(
            X_binned,
            n_bins_non_missing,
            missing_values_bin_idx,
            has_missing_values,
            is_categorical,
            monotonic_cst,
            l2_regularization,
            min_hessian_to_split,
            min_samples_leaf,
            min_gain_to_split,
            hessians_are_constant,
            n_threads,
        )
        self.n_bins_non_missing = n_bins_non_missing
        self.missing_values_bin_idx = missing_values_bin_idx
        self.max_leaf_nodes = max_leaf_nodes
        self.has_missing_values = has_missing_values
        self.monotonic_cst = monotonic_cst
        self.is_categorical = is_categorical
        self.l2_regularization = l2_regularization
        self.n_features = n_features
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.X_binned = X_binned
        self.min_gain_to_split = min_gain_to_split
        self.shrinkage = shrinkage
        self.n_threads = n_threads
        self.splittable_nodes = []  # heap ordered by decreasing gain
        self.finalized_leaves = []
        self.total_find_split_time = 0.0  # time spent finding the best splits
        self.total_compute_hist_time = 0.0  # time spent computing histograms
        self.total_apply_split_time = 0.0  # time spent splitting nodes
        self.n_categorical_splits = 0
        self._intilialize_root(gradients, hessians, hessians_are_constant)
        self.n_nodes = 1

    def _validate_parameters(
        self,
        X_binned,
        max_leaf_nodes,
        max_depth,
        min_samples_leaf,
        min_gain_to_split,
        l2_regularization,
        min_hessian_to_split,
    ):
        """Validate parameters passed to __init__.

        Also validate parameters passed to splitter.

        Raises
        ------
        NotImplementedError
            If ``X_binned`` is not of dtype uint8.
        ValueError
            If ``X_binned`` is not Fortran contiguous, or if one of the
            numerical parameters is below its lower bound.
        """
        if X_binned.dtype != np.uint8:
            raise NotImplementedError("X_binned must be of type uint8.")
        if not X_binned.flags.f_contiguous:
            raise ValueError(
                "X_binned should be passed as Fortran contiguous "
                "array for maximum efficiency."
            )
        if max_leaf_nodes is not None and max_leaf_nodes <= 1:
            raise ValueError(
                "max_leaf_nodes={} should not be smaller than 2".format(max_leaf_nodes)
            )
        if max_depth is not None and max_depth < 1:
            raise ValueError(
                "max_depth={} should not be smaller than 1".format(max_depth)
            )
        if min_samples_leaf < 1:
            raise ValueError(
                "min_samples_leaf={} should not be smaller than 1".format(
                    min_samples_leaf
                )
            )
        # The three checks below only reject strictly negative values; zero
        # is accepted even though the messages say "positive".
        if min_gain_to_split < 0:
            raise ValueError(
                "min_gain_to_split={} must be positive.".format(min_gain_to_split)
            )
        if l2_regularization < 0:
            raise ValueError(
                "l2_regularization={} must be positive.".format(l2_regularization)
            )
        if min_hessian_to_split < 0:
            raise ValueError(
                "min_hessian_to_split={} must be positive.".format(min_hessian_to_split)
            )

    def grow(self):
        """Grow the tree, from root to leaves."""
        while self.splittable_nodes:
            self.split_next()

        self._apply_shrinkage()

    def _apply_shrinkage(self):
        """Multiply leaves values by shrinkage parameter.

        This must be done at the very end of the growing process. If this were
        done during the growing process e.g. in finalize_leaf(), then a leaf
        would be shrunk but its sibling would potentially not be (if it's a
        non-leaf), which would lead to a wrong computation of the 'middle'
        value needed to enforce the monotonic constraints.
        """
        for leaf in self.finalized_leaves:
            leaf.value *= self.shrinkage

    def _intilialize_root(self, gradients, hessians, hessians_are_constant):
        """Initialize root node and finalize it if needed.

        The root is finalized as a leaf right away when it holds fewer than
        ``2 * min_samples_leaf`` samples or when its sum of hessians is below
        the splitter's ``min_hessian_to_split``. Otherwise its histograms are
        computed and its best split is evaluated.
        """
        n_samples = self.X_binned.shape[0]
        sum_gradients = sum_parallel(gradients, self.n_threads)
        if hessians_are_constant:
            # Only one hessian value is stored in that case.
            sum_hessians = hessians[0] * n_samples
        else:
            sum_hessians = sum_parallel(hessians, self.n_threads)

        self.root = TreeNode(
            depth=0,
            sample_indices=self.splitter.partition,
            sum_gradients=sum_gradients,
            sum_hessians=sum_hessians,
            value=0,
        )
        self.root.partition_start = 0
        self.root.partition_stop = n_samples

        if self.root.n_samples < 2 * self.min_samples_leaf:
            # Do not even bother computing any splitting statistics.
            self._finalize_leaf(self.root)
            return
        if sum_hessians < self.splitter.min_hessian_to_split:
            self._finalize_leaf(self.root)
            return

        self.root.histograms = self.histogram_builder.compute_histograms_brute(
            self.root.sample_indices
        )
        self._compute_best_split_and_push(self.root)

    def _compute_best_split_and_push(self, node):
        """Compute the best possible split (SplitInfo) of a given node.

        The node is pushed on the heap of splittable nodes when the gain of
        its best split is strictly positive, and finalized as a leaf
        otherwise. The gain of a node is 0 if either all the leaves are pure
        (best gain = 0), or if no split would satisfy the constraints,
        (min_hessians_to_split, min_gain_to_split, min_samples_leaf)
        """
        node.split_info = self.splitter.find_node_split(
            node.n_samples,
            node.histograms,
            node.sum_gradients,
            node.sum_hessians,
            node.value,
            node.children_lower_bound,
            node.children_upper_bound,
        )

        if node.split_info.gain <= 0:  # no valid split
            self._finalize_leaf(node)
        else:
            heappush(self.splittable_nodes, node)

    def split_next(self):
        """Split the node with highest potential gain.

        Returns
        -------
        left : TreeNode
            The resulting left child.
        right : TreeNode
            The resulting right child.
        """
        # Consider the node with the highest loss reduction (a.k.a. gain)
        node = heappop(self.splittable_nodes)
        split = node.split_info

        tic = time()
        (
            sample_indices_left,
            sample_indices_right,
            right_child_pos,
        ) = self.splitter.split_indices(split, node.sample_indices)
        self.total_apply_split_time += time() - tic

        depth = node.depth + 1
        # Leaves once this split is applied: the existing ones plus the two
        # children (the popped node is no longer counted).
        n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) + 2

        left_child_node = TreeNode(
            depth,
            sample_indices_left,
            split.sum_gradient_left,
            split.sum_hessian_left,
            value=split.value_left,
        )
        right_child_node = TreeNode(
            depth,
            sample_indices_right,
            split.sum_gradient_right,
            split.sum_hessian_right,
            value=split.value_right,
        )
        node.right_child = right_child_node
        node.left_child = left_child_node

        # The children share the parent's slice of splitter.partition: the
        # left one takes the first right_child_pos entries, the right one the
        # remainder.
        left_child_node.partition_start = node.partition_start
        left_child_node.partition_stop = node.partition_start + right_child_pos
        right_child_node.partition_start = left_child_node.partition_stop
        right_child_node.partition_stop = node.partition_stop

        if not self.has_missing_values[split.feature_idx]:
            # If no missing values are encountered at fit time, then samples
            # with missing values during predict() will go to whichever child
            # has the most samples.
            split.missing_go_to_left = (
                left_child_node.n_samples > right_child_node.n_samples
            )

        self.n_nodes += 2
        self.n_categorical_splits += split.is_categorical

        if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes:
            # Leaf budget exhausted: everything still pending becomes a leaf.
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            self._finalize_splittable_nodes()
            return left_child_node, right_child_node

        if self.max_depth is not None and depth == self.max_depth:
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            return left_child_node, right_child_node

        # A child needs at least 2 * min_samples_leaf samples to be split.
        if left_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(left_child_node)
        if right_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(right_child_node)

        if self.with_monotonic_cst:
            # Set value bounds for respecting monotonic constraints
            # See test_nodes_values() for details
            constraint = self.monotonic_cst[split.feature_idx]
            lower = node.children_lower_bound
            upper = node.children_upper_bound
            if constraint == MonotonicConstraint.NO_CST:
                # Unconstrained feature: both children inherit the bounds.
                left_bounds = right_bounds = (lower, upper)
            else:
                mid = (left_child_node.value + right_child_node.value) / 2
                if constraint == MonotonicConstraint.POS:
                    left_bounds, right_bounds = (lower, mid), (mid, upper)
                else:  # NEG
                    left_bounds, right_bounds = (mid, upper), (lower, mid)
            left_child_node.set_children_bounds(*left_bounds)
            right_child_node.set_children_bounds(*right_bounds)

        # Compute histograms of children, and compute their best possible split
        # (if needed)
        should_split_left = not left_child_node.is_leaf
        should_split_right = not right_child_node.is_leaf
        if should_split_left or should_split_right:
            # We will compute the histograms of both nodes even if one of them
            # is a leaf, since computing the second histogram is very cheap
            # (using histogram subtraction).
            n_samples_left = left_child_node.sample_indices.shape[0]
            n_samples_right = right_child_node.sample_indices.shape[0]
            if n_samples_left < n_samples_right:
                smallest_child, largest_child = left_child_node, right_child_node
            else:
                smallest_child, largest_child = right_child_node, left_child_node

            # We use the brute O(n_samples) method on the child that has the
            # smallest number of samples, and the subtraction trick O(n_bins)
            # on the other one.
            tic = time()
            smallest_child.histograms = self.histogram_builder.compute_histograms_brute(
                smallest_child.sample_indices
            )
            largest_child.histograms = (
                self.histogram_builder.compute_histograms_subtraction(
                    node.histograms, smallest_child.histograms
                )
            )
            self.total_compute_hist_time += time() - tic

            tic = time()
            if should_split_left:
                self._compute_best_split_and_push(left_child_node)
            if should_split_right:
                self._compute_best_split_and_push(right_child_node)
            self.total_find_split_time += time() - tic

            # Release memory used by histograms as they are no longer needed
            # for leaf nodes since they won't be split.
            for child in (left_child_node, right_child_node):
                if child.is_leaf:
                    del child.histograms

        # Release memory used by histograms as they are no longer needed for
        # internal nodes once children histograms have been computed.
        del node.histograms

        return left_child_node, right_child_node

    def _finalize_leaf(self, node):
        """Make node a leaf of the tree being grown."""
        node.is_leaf = True
        self.finalized_leaves.append(node)

    def _finalize_splittable_nodes(self):
        """Transform all splittable nodes into leaves.

        Used when some constraint is met e.g. maximum number of leaves or
        maximum depth."""
        while self.splittable_nodes:
            self._finalize_leaf(self.splittable_nodes.pop())

    def make_predictor(self, binning_thresholds):
        """Make a TreePredictor object out of the current tree.

        Parameters
        ----------
        binning_thresholds : array-like of floats
            Corresponds to the bin_thresholds_ attribute of the BinMapper.
            For each feature, this stores:

            - the bin frontiers for continuous features
            - the unique raw category values for categorical features

        Returns
        -------
        A TreePredictor object.
        """
        predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
        # One bitset (8 words) per categorical split, in binned and raw space.
        bitsets_shape = (self.n_categorical_splits, 8)
        binned_left_cat_bitsets = np.zeros(bitsets_shape, dtype=X_BITSET_INNER_DTYPE)
        raw_left_cat_bitsets = np.zeros(bitsets_shape, dtype=X_BITSET_INNER_DTYPE)
        _fill_predictor_arrays(
            predictor_nodes,
            binned_left_cat_bitsets,
            raw_left_cat_bitsets,
            self.root,
            binning_thresholds,
            self.n_bins_non_missing,
        )
        return TreePredictor(
            predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets
        )
|
||||
|
||||
|
||||
def _fill_predictor_arrays(
|
||||
predictor_nodes,
|
||||
binned_left_cat_bitsets,
|
||||
raw_left_cat_bitsets,
|
||||
grower_node,
|
||||
binning_thresholds,
|
||||
n_bins_non_missing,
|
||||
next_free_node_idx=0,
|
||||
next_free_bitset_idx=0,
|
||||
):
|
||||
"""Helper used in make_predictor to set the TreePredictor fields."""
|
||||
node = predictor_nodes[next_free_node_idx]
|
||||
node["count"] = grower_node.n_samples
|
||||
node["depth"] = grower_node.depth
|
||||
if grower_node.split_info is not None:
|
||||
node["gain"] = grower_node.split_info.gain
|
||||
else:
|
||||
node["gain"] = -1
|
||||
|
||||
node["value"] = grower_node.value
|
||||
|
||||
if grower_node.is_leaf:
|
||||
# Leaf node
|
||||
node["is_leaf"] = True
|
||||
return next_free_node_idx + 1, next_free_bitset_idx
|
||||
|
||||
split_info = grower_node.split_info
|
||||
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
|
||||
node["feature_idx"] = feature_idx
|
||||
node["bin_threshold"] = bin_idx
|
||||
node["missing_go_to_left"] = split_info.missing_go_to_left
|
||||
node["is_categorical"] = split_info.is_categorical
|
||||
|
||||
if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1:
|
||||
# Split is on the last non-missing bin: it's a "split on nans".
|
||||
# All nans go to the right, the rest go to the left.
|
||||
# Note: for categorical splits, bin_idx is 0 and we rely on the bitset
|
||||
node["num_threshold"] = np.inf
|
||||
elif split_info.is_categorical:
|
||||
categories = binning_thresholds[feature_idx]
|
||||
node["bitset_idx"] = next_free_bitset_idx
|
||||
binned_left_cat_bitsets[next_free_bitset_idx] = split_info.left_cat_bitset
|
||||
set_raw_bitset_from_binned_bitset(
|
||||
raw_left_cat_bitsets[next_free_bitset_idx],
|
||||
split_info.left_cat_bitset,
|
||||
categories,
|
||||
)
|
||||
next_free_bitset_idx += 1
|
||||
else:
|
||||
node["num_threshold"] = binning_thresholds[feature_idx][bin_idx]
|
||||
|
||||
next_free_node_idx += 1
|
||||
|
||||
node["left"] = next_free_node_idx
|
||||
next_free_node_idx, next_free_bitset_idx = _fill_predictor_arrays(
|
||||
predictor_nodes,
|
||||
binned_left_cat_bitsets,
|
||||
raw_left_cat_bitsets,
|
||||
grower_node.left_child,
|
||||
binning_thresholds=binning_thresholds,
|
||||
n_bins_non_missing=n_bins_non_missing,
|
||||
next_free_node_idx=next_free_node_idx,
|
||||
next_free_bitset_idx=next_free_bitset_idx,
|
||||
)
|
||||
|
||||
node["right"] = next_free_node_idx
|
||||
return _fill_predictor_arrays(
|
||||
predictor_nodes,
|
||||
binned_left_cat_bitsets,
|
||||
raw_left_cat_bitsets,
|
||||
grower_node.right_child,
|
||||
binning_thresholds=binning_thresholds,
|
||||
n_bins_non_missing=n_bins_non_missing,
|
||||
next_free_node_idx=next_free_node_idx,
|
||||
next_free_bitset_idx=next_free_bitset_idx,
|
||||
)
|
||||
Binary file not shown.
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
This module contains the TreePredictor class which is used for prediction.
|
||||
"""
|
||||
# Author: Nicolas Hug
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .common import Y_DTYPE
|
||||
from ._predictor import _predict_from_raw_data
|
||||
from ._predictor import _predict_from_binned_data
|
||||
from ._predictor import _compute_partial_dependence
|
||||
|
||||
|
||||
class TreePredictor:
    """Tree class used for predictions.

    Thin wrapper around the array of node records and the categorical
    bitsets; the actual traversal is delegated to the compiled helpers of
    ``_predictor``.

    Parameters
    ----------
    nodes : ndarray of PREDICTOR_RECORD_DTYPE
        The nodes of the tree.
    binned_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), \
            dtype=uint32
        Array of bitsets for binned categories used in predict_binned when a
        split is categorical.
    raw_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), \
            dtype=uint32
        Array of bitsets for raw categories used in predict when a split is
        categorical.
    """

    def __init__(self, nodes, binned_left_cat_bitsets, raw_left_cat_bitsets):
        self.nodes = nodes
        self.raw_left_cat_bitsets = raw_left_cat_bitsets
        self.binned_left_cat_bitsets = binned_left_cat_bitsets

    def get_n_leaf_nodes(self):
        """Return number of leaves."""
        leaf_flags = self.nodes["is_leaf"]
        return int(leaf_flags.sum())

    def get_max_depth(self):
        """Return maximum depth among all leaves."""
        depths = self.nodes["depth"]
        return int(depths.max())

    def predict(self, X, known_cat_bitsets, f_idx_map, n_threads):
        """Predict raw values for non-binned data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            The input samples.

        known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
            Array of bitsets of known categories, for each categorical feature.

        f_idx_map : ndarray of shape (n_features,)
            Map from original feature index to the corresponding index in the
            known_cat_bitsets array.

        n_threads : int
            Number of OpenMP threads to use.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The raw predicted values.
        """
        n_samples = X.shape[0]
        # Output buffer handed to the compiled helper as its last argument.
        raw_predictions = np.empty(n_samples, dtype=Y_DTYPE)
        _predict_from_raw_data(
            self.nodes,
            X,
            self.raw_left_cat_bitsets,
            known_cat_bitsets,
            f_idx_map,
            n_threads,
            raw_predictions,
        )
        return raw_predictions

    def predict_binned(self, X, missing_values_bin_idx, n_threads):
        """Predict raw values for binned data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            The input samples.
        missing_values_bin_idx : uint8
            Index of the bin that is used for missing values. This is the
            index of the last bin and is always equal to max_bins (as passed
            to the GBDT classes), or equivalently to n_bins - 1.
        n_threads : int
            Number of OpenMP threads to use.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The raw predicted values.
        """
        n_samples = X.shape[0]
        # Output buffer handed to the compiled helper as its last argument.
        raw_predictions = np.empty(n_samples, dtype=Y_DTYPE)
        _predict_from_binned_data(
            self.nodes,
            X,
            self.binned_left_cat_bitsets,
            missing_values_bin_idx,
            n_threads,
            raw_predictions,
        )
        return raw_predictions

    def compute_partial_dependence(self, grid, target_features, out):
        """Fast partial dependence computation.

        Nothing is returned: ``out`` is forwarded to the compiled helper,
        which presumably fills it in place.

        Parameters
        ----------
        grid : ndarray, shape (n_samples, n_target_features)
            The grid points on which the partial dependence should be
            evaluated.
        target_features : ndarray, shape (n_target_features)
            The set of target features for which the partial dependence
            should be evaluated.
        out : ndarray, shape (n_samples)
            The value of the partial dependence function on each grid
            point.
        """
        _compute_partial_dependence(self.nodes, grid, target_features, out)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,461 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_equal, assert_allclose
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import (
|
||||
_BinMapper,
|
||||
_find_binning_thresholds,
|
||||
_map_to_bins,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
# Effective number of OpenMP threads, used by the tests below.
n_threads = _openmp_effective_n_threads()


# Shared dataset: 1e6 samples with two normally distributed features
# (mean 0 / std 1 and mean 10 / std 0.01), cast to X_DTYPE. The seed is fixed
# so that the tests see the same values on every run.
DATA = (
    np.random.RandomState(42)
    .normal(loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2))
    .astype(X_DTYPE)
)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_regular_data():
    """Evenly spaced data yields evenly spaced thresholds."""
    data = np.linspace(0, 10, 1001)
    expected_by_max_bins = (
        (10, [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        (5, [2, 4, 6, 8]),
    )
    for max_bins, expected in expected_by_max_bins:
        bin_thresholds = _find_binning_thresholds(data, max_bins=max_bins)
        assert_allclose(bin_thresholds, expected)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_small_regular_data():
    """Check the thresholds found on 11 evenly spaced values.

    With max_bins at or above the number of distinct values (11 and 255
    here) the thresholds are the midpoints between consecutive values.
    """
    data = np.linspace(0, 10, 11)
    midpoints = np.arange(10) + 0.5
    expected_by_max_bins = (
        (5, [2, 4, 6, 8]),
        (10, [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        (11, midpoints),
        (255, midpoints),
    )
    for max_bins, expected in expected_by_max_bins:
        bin_thresholds = _find_binning_thresholds(data, max_bins=max_bins)
        assert_allclose(bin_thresholds, expected)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_random_data():
    # Thresholds computed independently for each of the two DATA columns.
    first, second = (
        _find_binning_thresholds(DATA[:, col], max_bins=255) for col in range(2)
    )
    for thresholds in (first, second):
        assert thresholds.shape == (254,)  # 255 - 1
        assert thresholds.dtype == DATA.dtype

    # Probe the thresholds at one quarter, one half and three quarters.
    probes = [64, 128, 192]
    assert_allclose(first[probes], np.array([-0.7, 0.0, 0.7]), atol=1e-1)
    assert_allclose(second[probes], np.array([9.99, 10.00, 10.01]), atol=1e-2)
|
||||
|
||||
|
||||
def test_find_binning_thresholds_low_n_bins():
    # Same shape/dtype checks as with 255 bins, but requesting 128 bins.
    per_feature = [
        _find_binning_thresholds(DATA[:, col], max_bins=128) for col in range(2)
    ]
    for thresholds in per_feature:
        assert thresholds.shape == (127,)  # 128 - 1
        assert thresholds.dtype == DATA.dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", (2, 257))
def test_invalid_n_bins(n_bins):
    # Fitting with n_bins outside of [3, 256] must raise an informative error.
    template = "n_bins={} should be no smaller than 3 and no larger than 256"
    with pytest.raises(ValueError, match=template.format(n_bins)):
        _BinMapper(n_bins=n_bins).fit(DATA)
|
||||
|
||||
|
||||
def test_bin_mapper_n_features_transform():
    # transform() must reject input whose number of features differs from
    # the one seen during fit (2 features at fit time, 4 at transform time).
    fitted_mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
    too_many_features = np.repeat(DATA, 2, axis=1)
    with pytest.raises(
        ValueError, match="This estimator was fitted with 2 features but 4 got passed"
    ):
        fitted_mapper.transform(too_many_features)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_bins", [16, 128, 255])
def test_map_to_bins(max_bins):
    # Bin DATA into a pre-allocated Fortran-ordered buffer and check the
    # buffer's properties as well as the bins of the extreme values.
    thresholds = [
        _find_binning_thresholds(DATA[:, col], max_bins=max_bins) for col in range(2)
    ]
    binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
    _map_to_bins(DATA, thresholds, max_bins, n_threads, binned)

    assert binned.shape == DATA.shape
    assert binned.dtype == np.uint8
    assert binned.flags.f_contiguous

    # The smallest value of each feature lands in the first bin and the
    # largest one in the last non-missing bin.
    rows_of_min = DATA.argmin(axis=0)
    rows_of_max = DATA.argmax(axis=0)
    for feature_idx, (row_min, row_max) in enumerate(zip(rows_of_min, rows_of_max)):
        assert binned[row_min, feature_idx] == 0
        assert binned[row_max, feature_idx] == max_bins - 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
    n_samples, n_features = DATA.shape

    # With balanced bins each bin should hold about n_samples / max_bins
    # samples; a 5% deviation is tolerated.
    expected_count_per_bin = n_samples // max_bins
    tol = int(0.05 * expected_count_per_bin)

    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1, random_state=42).fit(DATA)
    binned = mapper.transform(DATA)

    assert binned.shape == (n_samples, n_features)
    assert binned.dtype == np.uint8
    assert_array_equal(binned.min(axis=0), np.zeros(2, dtype=int))
    assert_array_equal(binned.max(axis=0), np.full(2, max_bins - 1))

    assert len(mapper.bin_thresholds_) == n_features
    for feature_thresholds in mapper.bin_thresholds_:
        assert feature_thresholds.shape == (max_bins - 1,)
        assert feature_thresholds.dtype == DATA.dtype
    assert np.all(mapper.n_bins_non_missing_ == max_bins)

    # Check that the binned data is approximately balanced across bins.
    for feature_idx in range(n_features):
        counts = np.bincount(binned[:, feature_idx], minlength=max_bins)[:max_bins]
        for count in counts:
            assert abs(count - expected_count_per_bin) < tol
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples, max_bins", [(5, 5), (5, 10), (5, 11), (42, 255)])
def test_bin_mapper_small_random_data(n_samples, max_bins):
    # When there are no more distinct values than bins, each value gets its
    # own bin: the binned value of a sample is its rank in the sorted data.
    data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
    assert len(np.unique(data)) == n_samples

    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1, random_state=42)
    binned = mapper.fit_transform(data)

    assert binned.shape == data.shape
    assert binned.dtype == np.uint8

    rank_order = np.argsort(data.ravel())
    assert_array_equal(binned.ravel()[rank_order], np.arange(n_samples))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "max_bins, n_distinct, multiplier",
    [
        (5, 5, 1),
        (5, 5, 3),
        (255, 12, 42),
    ],
)
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
    # The values 0..n_distinct-1, each repeated `multiplier` times, must be
    # mapped onto themselves.
    data = np.tile(np.arange(n_distinct), multiplier).reshape(-1, 1)
    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1)
    assert_array_equal(data, mapper.fit_transform(data))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_distinct", [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
    rng = np.random.RandomState(42)
    distinct_values = rng.normal(size=n_distinct)
    assert len(np.unique(distinct_values)) == n_distinct

    # 1000 samples drawn (with repetition) among the distinct values.
    picks = rng.randint(low=0, high=n_distinct, size=1000)
    data = distinct_values[picks]
    rng.shuffle(data)
    assert_array_equal(np.unique(data), np.sort(distinct_values))
    data = data.reshape(-1, 1)

    # One bin per distinct value.
    exact_mapper = _BinMapper(n_bins=n_distinct + 1)
    exact_binned = exact_mapper.fit_transform(data)
    assert_array_equal(np.unique(exact_binned[:, 0]), np.arange(n_distinct))

    # Adding more bins to the mapper yields the same results (same thresholds)
    roomy_mapper = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
    roomy_binned = roomy_mapper.fit_transform(data)

    assert_allclose(exact_mapper.bin_thresholds_[0], roomy_mapper.bin_thresholds_[0])
    assert_array_equal(exact_binned, roomy_binned)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "max_bins, scale, offset",
    [
        (3, 2, -1),
        (42, 1, 0),
        (255, 0.3, 42),
    ],
)
def test_bin_mapper_identity_small(max_bins, scale, offset):
    # An affine transformation of 0..max_bins-1 must be binned back to
    # 0..max_bins-1.
    ranks = np.arange(max_bins).reshape(-1, 1)
    data = ranks * scale + offset
    # max_bins is the number of bins for non-missing values
    mapper = _BinMapper(n_bins=max_bins + 1)
    assert_array_equal(mapper.fit_transform(data), ranks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "max_bins_small, max_bins_large",
    [
        (2, 2),
        (3, 3),
        (4, 4),
        (42, 42),
        (255, 255),
        (5, 17),
        (42, 255),
    ],
)
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
    # Binning already-binned data with a mapper that has at least as many
    # bins must leave the data unchanged.
    assert max_bins_large >= max_bins_small
    data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    # The second mapper must use `max_bins_large`: building it from
    # `max_bins_small` would leave the (5, 17) and (42, 255) cases identical
    # to a same-size check and `max_bins_large` effectively unused.
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
    binned_small = mapper_small.fit_transform(data)
    binned_large = mapper_large.fit_transform(binned_small)
    assert_array_equal(binned_small, binned_large)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", [10, 100, 256])
@pytest.mark.parametrize("diff", [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
    # Check that n_bins_non_missing is n_unique_values when
    # there are not a lot of unique values, else n_bins - 1.
    n_unique_values = n_bins + diff
    # Every unique value appears exactly twice.
    X = np.tile(np.arange(n_unique_values), 2).reshape(-1, 1)
    mapper = _BinMapper(n_bins=n_bins).fit(X)

    expected = min(n_bins - 1, n_unique_values)
    assert np.all(mapper.n_bins_non_missing_ == expected)
|
||||
|
||||
|
||||
def test_subsample():
    # Make sure bin thresholds are different when applying subsampling
    full_thresholds = _BinMapper(subsample=None, random_state=0).fit(DATA).bin_thresholds_
    sub_thresholds = _BinMapper(subsample=256, random_state=0).fit(DATA).bin_thresholds_

    n_features = DATA.shape[1]
    for feature_idx in range(n_features):
        same = np.allclose(
            full_thresholds[feature_idx], sub_thresholds[feature_idx], rtol=1e-4
        )
        assert not same
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "n_bins, n_bins_non_missing, X_trans_expected",
    [
        (
            256,
            [4, 2, 2],
            # bin 255 <=> missing value
            [
                [0, 0, 0],
                [255, 255, 0],
                [1, 0, 0],
                [255, 1, 1],
                [2, 1, 1],
                [3, 0, 0],
            ],
        ),
        (
            3,
            [2, 2, 2],
            # bin 2 <=> missing value
            [
                [0, 0, 0],
                [2, 2, 0],
                [0, 0, 0],
                [2, 1, 1],
                [1, 1, 1],
                [1, 0, 0],
            ],
        ),
    ],
)
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
    # check for missing values: make sure nans are mapped to the last bin
    # and that the _BinMapper attributes are correct

    # `np.nan` is used rather than the `np.NaN` alias, which was removed in
    # NumPy 2.0.
    X = [
        [1, 1, 0],
        [np.nan, np.nan, 0],
        [2, 1, 0],
        [np.nan, 2, 1],
        [3, 2, 1],
        [4, 1, 0],
    ]

    X = np.array(X)

    mapper = _BinMapper(n_bins=n_bins)
    mapper.fit(X)

    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)

    # One threshold less than the number of non-missing bins for each feature.
    for feature_idx in range(X.shape[1]):
        assert (
            len(mapper.bin_thresholds_[feature_idx])
            == n_bins_non_missing[feature_idx] - 1
        )

    # The last bin is always reserved for missing values.
    assert mapper.missing_values_bin_idx_ == n_bins - 1

    X_trans = mapper.transform(X)
    assert_array_equal(X_trans, X_trans_expected)
|
||||
|
||||
|
||||
def test_infinite_values():
    # Make sure infinite values are properly handled.
    X = np.array([[-np.inf], [0], [1], [np.inf]])

    bin_mapper = _BinMapper()
    bin_mapper.fit(X)

    expected_thresholds = [-np.inf, 0.5, ALMOST_INF]
    assert_allclose(bin_mapper.bin_thresholds_[0], expected_thresholds)
    assert bin_mapper.n_bins_non_missing_ == [4]

    # Each of the four values ends up in its own bin.
    assert_array_equal(bin_mapper.transform(X), np.arange(4).reshape(-1, 1))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", [15, 256])
def test_categorical_feature(n_bins):
    # Basic test for categorical features
    # we make sure that categories are mapped into [0, n_categories - 1] and
    # that nans are mapped to the last bin
    raw_values = (
        [4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5 + [np.nan] * 2
    )
    X = np.array(raw_values, dtype=X_DTYPE).reshape(-1, 1)
    known_categories = [np.unique(X[~np.isnan(X)])]

    bin_mapper = _BinMapper(
        n_bins=n_bins,
        is_categorical=np.array([True]),
        known_categories=known_categories,
    )
    bin_mapper.fit(X)
    assert bin_mapper.n_bins_non_missing_ == [6]
    assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13])

    # Known categories map to their rank; nan maps to the last bin.
    X = np.array([0, 1, 4, np.nan, 7, 10, 13], dtype=X_DTYPE).reshape(-1, 1)
    expected_trans = np.array([0, 1, 2, n_bins - 1, 3, 4, 5]).reshape(-1, 1)
    assert_array_equal(bin_mapper.transform(X), expected_trans)

    # For unknown categories, the mapping is incorrect / undefined. This never
    # happens in practice. This check is only for illustration purpose.
    X = np.array([-1, 100], dtype=X_DTYPE).reshape(-1, 1)
    expected_trans = np.array([0, 6]).reshape(-1, 1)
    assert_array_equal(bin_mapper.transform(X), expected_trans)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", (128, 256))
def test_categorical_with_numerical_features(n_bins):
    # basic check for binmapper with mixed data
    numerical = np.arange(10, 20).reshape(-1, 1)
    categories = np.arange(10, 15).reshape(-1, 1)
    # The five categories are repeated twice to match the 10 numerical rows.
    categorical = np.vstack([categories, categories])
    X = np.hstack([numerical, categorical])
    known_categories = [None, np.unique(categorical).astype(X_DTYPE)]

    bin_mapper = _BinMapper(
        n_bins=n_bins,
        is_categorical=np.array([False, True]),
        known_categories=known_categories,
    )
    bin_mapper.fit(X)

    assert_array_equal(bin_mapper.n_bins_non_missing_, [10, 5])

    bin_thresholds = bin_mapper.bin_thresholds_
    assert len(bin_thresholds) == 2
    assert_array_equal(bin_thresholds[1], np.arange(10, 15))

    # Numerical column: row i -> bin i. Categorical column: bins cycle 0..4.
    expected_X_trans = [[row, row % 5] for row in range(10)]
    assert_array_equal(bin_mapper.transform(X), expected_X_trans)
|
||||
|
||||
|
||||
def test_make_known_categories_bitsets():
    # Check the output of make_known_categories_bitsets
    X = np.array(
        [[14, 2, 30], [30, 4, 70], [40, 10, 180], [40, 240, 180]], dtype=X_DTYPE
    )

    bin_mapper = _BinMapper(
        n_bins=256,
        is_categorical=np.array([False, True, True]),
        known_categories=[None, X[:, 1], X[:, 2]],
    )
    bin_mapper.fit(X)

    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()

    # Note that for non-categorical features, values are left to 0
    assert_allclose(np.array([0, 0, 1], dtype=np.uint8), f_idx_map)

    # Each categorical feature owns one row of eight 32-bit words. Category c
    # sets bit (c % 32) of word (c // 32); e.g. 240 = 32 * 7 + 16, therefore
    # bit 16 of the word at index 7 is set.
    categories_by_feature = {
        1: [2, 4, 10, 240],  # first categorical feature
        2: [30, 70, 180],  # second categorical feature
    }
    expected_cat_bitset = np.zeros((2, 8), dtype=np.uint32)
    for f_idx, categories in categories_by_feature.items():
        mapped_f_idx = f_idx_map[f_idx]
        for category in categories:
            word_idx, bit_idx = divmod(category, 32)
            expected_cat_bitset[mapped_f_idx, word_idx] |= np.uint32(1 << bit_idx)

    assert_allclose(expected_cat_bitset, known_cat_bitsets)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "is_categorical, known_categories, match",
    [
        (np.array([True]), [None], "Known categories for feature 0 must be provided"),
        (
            np.array([False]),
            np.array([1, 2, 3]),
            "isn't marked as a categorical feature, but categories were passed",
        ),
    ],
)
def test_categorical_parameters(is_categorical, known_categories, match):
    # test the validation of the is_categorical and known_categories parameters
    X = np.array([[1, 2, 3]], dtype=X_DTYPE)
    mapper_kwargs = {
        "is_categorical": is_categorical,
        "known_categories": known_categories,
    }
    bin_mapper = _BinMapper(**mapper_kwargs)

    # Inconsistent parameters are reported when fitting.
    with pytest.raises(ValueError, match=match):
        bin_mapper.fit(X)
|
||||
@@ -0,0 +1,64 @@
|
||||
import pytest
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting._bitset import (
|
||||
set_bitset_memoryview,
|
||||
in_bitset_memoryview,
|
||||
set_raw_bitset_from_binned_bitset,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values_to_insert, expected_bitset",
    [
        ([0, 4, 33], np.array([2**0 + 2**4, 2**1, 0], dtype=np.uint32)),
        (
            [31, 32, 33, 79],
            np.array([2**31, 2**0 + 2**1, 2**15], dtype=np.uint32),
        ),
    ],
)
def test_set_get_bitset(values_to_insert, expected_bitset):
    # Insert values into a 3 x 32-bit bitset, then check both the raw words
    # and the membership answer for every representable value.
    n_32bits_ints = 3
    bitset = np.zeros(n_32bits_ints, dtype=np.uint32)
    for value in values_to_insert:
        set_bitset_memoryview(bitset, value)
    assert_allclose(expected_bitset, bitset)

    inserted = set(values_to_insert)
    for candidate in range(32 * n_32bits_ints):
        is_member = bool(in_bitset_memoryview(bitset, candidate))
        assert is_member == (candidate in inserted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw_categories, binned_cat_to_insert, expected_raw_bitset",
    [
        (
            [3, 4, 5, 10, 31, 32, 43],
            [0, 2, 4, 5, 6],
            [2**3 + 2**5 + 2**31, 2**0 + 2**11],
        ),
        ([3, 33, 50, 52], [1, 3], [0, 2**1 + 2**20]),
    ],
)
def test_raw_bitset_from_binned_bitset(
    raw_categories, binned_cat_to_insert, expected_raw_bitset
):
    # A binned category is the position of a raw category in
    # `raw_categories`: converting the binned bitset must flag exactly the
    # raw categories whose position was inserted.
    raw_categories = np.asarray(raw_categories, dtype=X_DTYPE)
    binned_bitset = np.zeros(2, dtype=np.uint32)
    raw_bitset = np.zeros(2, dtype=np.uint32)

    for binned_cat in binned_cat_to_insert:
        set_bitset_memoryview(binned_bitset, binned_cat)

    set_raw_bitset_from_binned_bitset(raw_bitset, binned_bitset, raw_categories)

    assert_allclose(expected_raw_bitset, raw_bitset)
    for position, raw_cat in enumerate(raw_categories):
        should_be_set = position in binned_cat_to_insert
        assert bool(in_bitset_memoryview(raw_bitset, raw_cat)) == should_be_set
|
||||
@@ -0,0 +1,250 @@
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.datasets import make_classification, make_regression
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble import HistGradientBoostingRegressor
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
|
||||
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Make sure sklearn has the same predictions as lightgbm for easy targets.
    #
    # In particular when the size of the trees are bound and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and sklearn should be exactly identical.
    #
    # Notes:
    # - When a node holds few samples, several candidate splits may have the
    #   same gain (also because of float errors), so the two libraries may
    #   build different trees and disagree on the test set. Test-set
    #   predictions are therefore only compared when the number of samples is
    #   large enough and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences in the binning
    #   strategy, data is pre-binned if n_samples > 255.
    # - The absolute_error loss is not checked here: LightGBM's computation
    #   of the median (used for the initial value of raw_prediction) is a bit
    #   off (e.g. it returns midpoints when there is no need to). Since these
    #   tests only run 1 iteration, the discrepancy between the initial
    #   values leads to biggish differences in the predictions. These
    #   differences are much smaller with more iterations.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_bins = 255

    X, y = make_regression(
        n_samples=n_samples, n_features=5, n_informative=5, random_state=0
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingRegressor(
        max_iter=1,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    def mismatch_rate(X_eval, tolerance):
        # Fraction of samples whose two predictions differ by more than
        # `tolerance`.
        gap = est_lightgbm.predict(X_eval) - est_sklearn.predict(X_eval)
        return np.mean(abs(gap) > tolerance)

    # about 1% (threshold: 1.1%) of the training predictions may differ at
    # the 3rd decimal
    assert mismatch_rate(X_train, 1e-3) < 0.011

    if max_leaf_nodes < 10 and n_samples >= 1000:
        # less than 1% of the predictions are different up to the 4th decimal
        assert mismatch_rate(X_test, 1e-4) < 0.01
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (1000, 8),
    ],
)
def test_same_predictions_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_classes = 2
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=1,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    # The training set is always checked, with accuracies compared to 7
    # decimals; the test set only when the trees are small and the data large
    # enough, with accuracies compared to 2 decimals.
    evaluations = [(X_train, y_train, 7)]
    if max_leaf_nodes < 10 and n_samples >= 1000:
        evaluations.append((X_test, y_test, 2))

    for X_eval, y_eval, decimal in evaluations:
        pred_lightgbm = est_lightgbm.predict(X_eval)
        pred_sklearn = est_sklearn.predict(X_eval)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_eval, pred_lightgbm)
        acc_sklearn = accuracy_score(y_eval, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=decimal)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize(
    "n_samples, max_leaf_nodes",
    [
        (255, 4096),
        (10000, 8),
    ],
)
def test_same_predictions_multiclass_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    # Same as test_same_predictions_regression but for multiclass
    # classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_classes = 3
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)

    np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        # The probabilities must be computed on the held-out data here:
        # reusing X_train would merely repeat the training-set check above.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,569 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from pytest import approx
|
||||
from numpy.testing import assert_array_equal
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BITSET_INNER_DTYPE
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
# Effective OpenMP thread count, forwarded to `predict_binned` in the tests below.
n_threads = _openmp_effective_n_threads()
|
||||
|
||||
|
||||
def _make_training_data(n_bins=256, constant_hessian=True):
    # Build a seeded toy problem: 10000 samples with 2 already-binned
    # features, plus the gradients and hessians to feed to the grower.
    # Returns (X_binned, all_gradients, all_hessians).
    rng = np.random.RandomState(42)
    n_samples = 10000

    # Generate some test data directly binned so as to test the grower code
    # independently of the binning logic.
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE)
    )

    # Ground truth decision function: a very simple yet asymmetric decision
    # tree. The target is 1 only when feature 0 is above n_bins // 2 AND
    # feature 1 is above n_bins // 3, and -1 everywhere else. The grower code
    # should therefore have no trouble recovering it from 10000 samples.
    first_above = X_binned[:, 0] > n_bins // 2
    second_above = X_binned[:, 1] > n_bins // 3
    target = np.where(first_above & second_above, 1, -1).astype(Y_DTYPE)

    # Assume a square loss applied to an initial model that always predicts 0
    # (hardcoded for this test):
    all_gradients = target.astype(G_H_DTYPE)
    if constant_hessian:
        hessians_shape = 1
    else:
        hessians_shape = all_gradients.shape
    all_hessians = np.ones(shape=hessians_shape, dtype=G_H_DTYPE)

    return X_binned, all_gradients, all_hessians
|
||||
|
||||
|
||||
def _check_children_consistency(parent, left, right):
    # Make sure the samples are correctly dispatched from a parent to its
    # children
    assert parent.left_child is left
    assert parent.right_child is right

    n_left = len(left.sample_indices)
    n_right = len(right.sample_indices)
    left_ids = set(left.sample_indices)
    right_ids = set(right.sample_indices)

    # each sample from the parent is propagated to one of the two children
    assert n_left + n_right == len(parent.sample_indices)
    assert left_ids | right_ids == set(parent.sample_indices)

    # samples are sent either to the left or the right node, never to both
    assert left_ids.isdisjoint(right_ids)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "n_bins, constant_hessian, stopping_param, shrinkage",
    [
        (11, True, "min_gain_to_split", 0.5),
        (11, False, "min_gain_to_split", 1.0),
        (11, True, "max_leaf_nodes", 1.0),
        (11, False, "max_leaf_nodes", 0.1),
        (42, True, "max_leaf_nodes", 0.01),
        (42, False, "max_leaf_nodes", 1.0),
        (256, True, "min_gain_to_split", 1.0),
        (256, True, "max_leaf_nodes", 0.1),
    ],
)
def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage):
    # Grow the tree one split at a time on the toy data and check the state
    # of the grower after every step.
    X_binned, all_gradients, all_hessians = _make_training_data(
        n_bins=n_bins, constant_hessian=constant_hessian
    )
    n_samples = X_binned.shape[0]

    # Turn the name of the stopping criterion into the matching keyword.
    stopping_kwargs = (
        {"max_leaf_nodes": 3}
        if stopping_param == "max_leaf_nodes"
        else {"min_gain_to_split": 0.01}
    )

    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=shrinkage,
        min_samples_leaf=1,
        **stopping_kwargs,
    )

    # The root node is not yet split, but the best possible split has
    # already been evaluated:
    root = grower.root
    assert root.left_child is None
    assert root.right_child is None

    root_split = root.split_info
    assert root_split.feature_idx == 0
    assert root_split.bin_idx == n_bins // 2
    assert len(grower.splittable_nodes) == 1

    # Calling split next applies the next split and computes the best split
    # for each of the two newly introduced children nodes.
    left_node, right_node = grower.split_next()

    # All training samples have been split in the two nodes, approximately
    # 50%/50%
    _check_children_consistency(grower.root, left_node, right_node)
    n_left = len(left_node.sample_indices)
    assert 0.4 * n_samples < n_left < 0.6 * n_samples

    if grower.min_gain_to_split > 0:
        # The left node is too pure: there is no gain to split it further.
        assert left_node.split_info.gain < grower.min_gain_to_split
        assert left_node in grower.finalized_leaves

    # The right node can still be split further, this time on feature #1
    right_split = right_node.split_info
    assert right_split.gain > 1.0
    assert right_split.feature_idx == 1
    assert right_split.bin_idx == n_bins // 3
    assert right_node.left_child is None
    assert right_node.right_child is None

    # The right split has not been applied yet. Let's do it now:
    assert len(grower.splittable_nodes) == 1
    right_left_node, right_right_node = grower.split_next()
    _check_children_consistency(right_node, right_left_node, right_right_node)

    n_right_left = len(right_left_node.sample_indices)
    assert 0.1 * n_samples < n_right_left < 0.2 * n_samples

    n_right_right = len(right_right_node.sample_indices)
    assert 0.2 * n_samples < n_right_right < 0.4 * n_samples

    # All the leaves are pure, it is not possible to split any further:
    assert not grower.splittable_nodes

    grower._apply_shrinkage()

    # Check the values of the leaves:
    right_subtree = grower.root.right_child
    assert grower.root.left_child.value == approx(shrinkage)
    assert right_subtree.left_child.value == approx(shrinkage)
    assert right_subtree.right_child.value == approx(-shrinkage, rel=1e-3)
|
||||
|
||||
|
||||
def test_predictor_from_grower():
    # Grow a tree on the toy dataset returned by _make_training_data, capped
    # at three leaves, then convert it into a predictor and probe it.
    n_bins = 256
    X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins)
    grower_kwargs = dict(
        n_bins=n_bins,
        shrinkage=1.0,
        max_leaf_nodes=3,
        min_samples_leaf=5,
    )
    grower = TreeGrower(X_binned, all_gradients, all_hessians, **grower_kwargs)
    grower.grow()
    # 2 decision nodes + 3 leaves
    assert grower.n_nodes == 5

    # The grower's node structure must be convertible into a predictor
    # object. The thresholds are all-zero placeholders: only
    # predict_binned() is called on this predictor below.
    n_features = X_binned.shape[1]
    placeholder_thresholds = np.zeros((n_features, n_bins))
    predictor = grower.make_predictor(binning_thresholds=placeholder_thresholds)
    nodes = predictor.nodes
    assert nodes.shape[0] == 5
    assert nodes["is_leaf"].sum() == 3

    # Probe each leaf of the tree: every group of 3 rows corresponds to one
    # condition in _make_training_data.
    probes = np.array(
        [
            # first group
            [0, 0],
            [42, 99],
            [128, 254],
            # second group
            [129, 0],
            [129, 85],
            [254, 85],
            # third group
            [129, 86],
            [129, 254],
            [242, 100],
        ],
        dtype=np.uint8,
    )
    expected_targets = [1] * 6 + [-1] * 3
    missing_values_bin_idx = n_bins - 1
    probe_predictions = predictor.predict_binned(
        probes, missing_values_bin_idx, n_threads
    )
    assert np.allclose(probe_predictions, expected_targets)

    # The training set itself must be recovered exactly.
    training_predictions = predictor.predict_binned(
        X_binned, missing_values_bin_idx, n_threads
    )
    assert np.allclose(training_predictions, -all_gradients)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "n_samples, min_samples_leaf, n_bins, constant_hessian, noise",
    [
        (11, 10, 7, True, 0),
        (13, 10, 42, False, 0),
        (56, 10, 255, True, 0.1),
        (101, 3, 7, True, 0),
        (200, 42, 42, False, 0),
        (300, 55, 255, True, 0.1),
        (300, 301, 255, True, 0.1),
    ],
)
def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, noise):
    # Every leaf of the grown tree must hold at least min_samples_leaf
    # samples. When there are fewer samples than min_samples_leaf, the tree
    # must be reduced to a single root leaf holding all of them.
    rng = np.random.RandomState(seed=0)

    # Three features with a linear target on the first two; the third one is
    # irrelevant.
    X_raw = rng.normal(size=(n_samples, 3))
    target = X_raw[:, 0] - X_raw[:, 1]
    if noise:
        # Noise is expressed relative to the spread of the clean target.
        target = target + target.std() * rng.normal(scale=noise, size=n_samples)

    mapper = _BinMapper(n_bins=n_bins)
    X_binned = mapper.fit_transform(X_raw)

    all_gradients = target.astype(G_H_DTYPE)
    if constant_hessian:
        hessian_shape = 1
    else:
        hessian_shape = all_gradients.shape
    all_hessians = np.ones(shape=hessian_shape, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()
    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)
    nodes = predictor.nodes

    if n_samples < min_samples_leaf:
        # Not enough samples for even one valid leaf besides the root.
        assert nodes.shape[0] == 1
        assert nodes[0]["is_leaf"]
        assert nodes[0]["count"] == n_samples
        return

    for node in nodes:
        if not node["is_leaf"]:
            continue
        assert node["count"] >= min_samples_leaf
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)])
def test_min_samples_leaf_root(n_samples, min_samples_leaf):
    # The root node must stay unsplit unless n_samples is at least twice
    # min_samples_leaf.
    rng = np.random.RandomState(seed=0)
    n_bins = 256

    # Three features with a linear target on the first two; the third one is
    # irrelevant.
    X_raw = rng.normal(size=(n_samples, 3))
    target = X_raw[:, 0] - X_raw[:, 1]
    X_binned = _BinMapper(n_bins=n_bins).fit_transform(X_raw)

    all_gradients = target.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=n_bins,
        shrinkage=1.0,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=n_samples,
    )
    grower.grow()

    n_leaves = len(grower.finalized_leaves)
    root_can_be_split = n_samples >= min_samples_leaf * 2
    if not root_can_be_split:
        assert n_leaves == 1
    else:
        assert n_leaves >= 2
|
||||
|
||||
|
||||
def assert_is_stump(grower):
    """Assert that the tree held by ``grower`` is a stump.

    A stump is a root that has been split exactly once: it has a left and a
    right child, and neither child has children of its own. This is the shape
    expected when ``max_depth=1``.

    Parameters
    ----------
    grower : object
        Any object exposing ``root.left_child`` and ``root.right_child``,
        whose children in turn expose ``left_child`` / ``right_child``.

    Raises
    ------
    AssertionError
        If the root has not been split, or if one of its children has been
        split further.
    """
    children = (grower.root.left_child, grower.root.right_child)
    # An unsplit root is not a stump. Check it explicitly so the failure is a
    # readable assertion instead of an AttributeError on None below.
    assert all(child is not None for child in children)
    for leaf in children:
        assert leaf.left_child is None
        assert leaf.right_child is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_depth", [1, 2, 3])
def test_max_depth(max_depth):
    # The deepest finalized leaf must sit exactly at max_depth, and
    # max_depth=1 must yield a stump.
    rng = np.random.RandomState(seed=0)

    n_bins = 256
    n_samples = 1000

    # Three features with a linear target on the first two; the third one is
    # irrelevant.
    X_raw = rng.normal(size=(n_samples, 3))
    target = X_raw[:, 0] - X_raw[:, 1]
    X_binned = _BinMapper(n_bins=n_bins).fit_transform(X_raw)

    all_gradients = target.astype(G_H_DTYPE)
    all_hessians = np.ones(shape=1, dtype=G_H_DTYPE)
    grower = TreeGrower(X_binned, all_gradients, all_hessians, max_depth=max_depth)
    grower.grow()

    leaf_depths = [leaf.depth for leaf in grower.finalized_leaves]
    assert max(leaf_depths) == max_depth

    if max_depth == 1:
        assert_is_stump(grower)
|
||||
|
||||
|
||||
def test_input_validation():
    # TreeGrower must reject an X_binned that has the wrong dtype or the
    # wrong memory layout.
    X_binned, all_gradients, all_hessians = _make_training_data()

    # Wrong dtype: float32 instead of uint8.
    as_float32 = X_binned.astype(np.float32)
    expect_dtype_error = pytest.raises(
        NotImplementedError, match="X_binned must be of type uint8"
    )
    with expect_dtype_error:
        TreeGrower(as_float32, all_gradients, all_hessians)

    # Wrong layout: C-contiguous instead of Fortran-contiguous.
    as_c_order = np.ascontiguousarray(X_binned)
    expect_layout_error = pytest.raises(
        ValueError, match="X_binned should be passed as Fortran contiguous array"
    )
    with expect_layout_error:
        TreeGrower(as_c_order, all_gradients, all_hessians)
|
||||
|
||||
|
||||
def test_init_parameters_validation():
    # Negative values for the split-gating parameters must be rejected with
    # a ValueError naming the offending parameter.
    X_binned, all_gradients, all_hessians = _make_training_data()

    invalid_cases = (
        ({"min_gain_to_split": -1}, "min_gain_to_split=-1 must be positive"),
        ({"min_hessian_to_split": -1}, "min_hessian_to_split=-1 must be positive"),
    )
    for bad_kwargs, expected_message in invalid_cases:
        with pytest.raises(ValueError, match=expected_message):
            TreeGrower(X_binned, all_gradients, all_hessians, **bad_kwargs)
|
||||
|
||||
|
||||
def test_missing_value_predict_only():
    # Make sure that missing values are supported at predict time even if
    # they were not encountered in the training data: the missing values are
    # assigned to whichever child has the most samples.

    rng = np.random.RandomState(0)
    n_samples = 100
    X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8)
    X_binned = np.asfortranarray(X_binned)

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, min_samples_leaf=5, has_missing_values=False
    )
    grower.grow()

    # The thresholds are all-zero placeholders: the only predict() call below
    # is made on all-NaN inputs.
    # The number of bins is computed as a Python int on purpose: X_binned is
    # uint8 and may contain 255, in which case `X_binned.max() + 1` stays
    # uint8 and wraps around to 0 under NumPy's NEP 50 promotion rules,
    # yielding a zero-width thresholds array.
    n_bins = int(X_binned.max()) + 1
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((X_binned.shape[1], n_bins))
    )

    # Go from the root to a leaf, always following the child with the most
    # samples (ties go to the right child). That is the path NaNs are
    # supposed to take.
    node = predictor.nodes[0]
    while not node["is_leaf"]:
        left = predictor.nodes[node["left"]]
        right = predictor.nodes[node["right"]]
        node = left if left["count"] > right["count"] else right

    prediction_main_path = node["value"]

    # Now build an X_test made only of NaNs and make sure every prediction is
    # equal to prediction_main_path.
    all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan)
    # Empty arrays: no categorical-feature entries are passed to predict().
    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads)
    assert np.all(y_pred == prediction_main_path)
|
||||
|
||||
|
||||
def test_split_on_nan_with_infinite_values():
    # A "split on nan" must still be honoured when some samples are +inf.
    # The threshold of such a split is set to +inf, so this guards against
    # edge-case bugs that could introduce. The private API is used so that
    # predict_binned() can be exercised as well.

    X = np.array([[0], [1], [np.inf], [np.nan], [np.nan]])
    # These gradient values force a split-on-nan situation.
    gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    bin_mapper = _BinMapper()
    X_binned = bin_mapper.fit_transform(X)

    n_bins_non_missing = 3
    grower = TreeGrower(
        X_binned,
        gradients,
        hessians,
        n_bins_non_missing=n_bins_non_missing,
        has_missing_values=True,
        min_samples_leaf=1,
        n_threads=n_threads,
    )
    grower.grow()

    predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_)

    # Sanity check: the root split really is a split on nan.
    root = predictor.nodes[0]
    assert root["num_threshold"] == np.inf
    assert root["bin_threshold"] == n_bins_non_missing - 1

    known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()

    # In particular the +inf sample must be mapped to the left child.
    # Note that lightgbm "fails" here and will assign the inf sample to the
    # right child, even though it's a "split on nan" situation.
    raw_predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads)
    binned_predictions = predictor.predict_binned(
        X_binned,
        missing_values_bin_idx=bin_mapper.missing_values_bin_idx_,
        n_threads=n_threads,
    )
    expected = -gradients
    np.testing.assert_allclose(raw_predictions, expected)
    np.testing.assert_allclose(binned_predictions, expected)
|
||||
|
||||
|
||||
def test_grow_tree_categories():
    # The grower must produce the right predictor tree when the split is
    # categorical.
    binned_column = [0, 1] * 11 + [1]
    X_binned = np.asfortranarray(np.array([binned_column], dtype=X_BINNED_DTYPE).T)

    all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    is_categorical = np.ones(1, dtype=np.uint8)

    grower = TreeGrower(
        X_binned,
        all_gradients,
        all_hessians,
        n_bins=4,
        shrinkage=1.0,
        min_samples_leaf=1,
        is_categorical=is_categorical,
        n_threads=n_threads,
    )
    grower.grow()
    assert grower.n_nodes == 3

    # Raw category values behind bins 0 and 1 are 4 and 9 respectively.
    categories = [np.array([4, 9], dtype=X_DTYPE)]
    predictor = grower.make_predictor(binning_thresholds=categories)
    nodes = predictor.nodes
    root = nodes[0]
    assert root["count"] == 23
    assert root["depth"] == 0
    assert root["is_categorical"]

    left = nodes[root["left"]]
    right = nodes[root["right"]]

    # Arbitrary validation, but this means the ones go to the left.
    assert left["count"] >= right["count"]

    # Binned bitset: only bit 1 (binned category 1) is set.
    empty_words = [0] * 7
    assert_array_equal(predictor.binned_left_cat_bitsets[0], [1 << 1] + empty_words)

    # Raw bitset: only bit 9 (raw category 9) is set.
    assert_array_equal(predictor.raw_left_cat_bitsets[0], [1 << 9] + empty_words)

    # There were no missing values during training, so they are not part of
    # the bitsets. They are nevertheless expected to go to the biggest child
    # (i.e. the left one), whose value is -1 = negative gradient.
    assert root["missing_go_to_left"]

    # Binned missing values must reach the left child during prediction.
    missing_bin = np.asarray([[6]]).astype(X_BINNED_DTYPE)
    prediction_binned = predictor.predict_binned(
        missing_bin,
        missing_values_bin_idx=6,
        n_threads=n_threads,
    )
    assert_allclose(prediction_binned, [-1])  # negative gradient

    # Raw missing values must reach the left child during prediction too.
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)  # ignored anyway
    f_idx_map = np.array([0], dtype=np.uint32)
    raw_missing = np.array([[np.nan]])
    prediction = predictor.predict(
        raw_missing, known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(prediction, [-1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_samples_leaf", (1, 20))
@pytest.mark.parametrize("n_unique_categories", (2, 10, 100))
@pytest.mark.parametrize("target", ("binary", "random", "equal"))
def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target):
    # Native categorical splits must give the same predictions as growing a
    # tree on a one-hot encoding of the same feature, given enough depth.

    rng = np.random.RandomState(0)
    n_samples = 10_000
    X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8)

    X_ohe = OneHotEncoder(sparse=False).fit_transform(X_binned)
    X_ohe = np.asfortranarray(X_ohe).astype(np.uint8)

    if target in ("equal", "binary"):
        # Gradients derived from the category itself, or from its parity.
        flat_categories = X_binned.reshape(-1)
        gradients = flat_categories if target == "equal" else flat_categories % 2
    else:
        gradients = rng.randn(n_samples)
    gradients = gradients.astype(G_H_DTYPE)

    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    # No cap on depth or on the number of leaves.
    grower_params = dict(
        min_samples_leaf=min_samples_leaf,
        max_depth=None,
        max_leaf_nodes=None,
    )

    # Tree grown on the single native categorical feature. The thresholds are
    # placeholders since predict() is never called.
    grower = TreeGrower(
        X_binned, gradients, hessians, is_categorical=[True], **grower_params
    )
    grower.grow()
    predictor = grower.make_predictor(
        binning_thresholds=np.zeros((1, n_unique_categories))
    )
    preds = predictor.predict_binned(
        X_binned, missing_values_bin_idx=255, n_threads=n_threads
    )

    # Tree grown on the one-hot encoded columns.
    grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params)
    grower_ohe.grow()
    predictor_ohe = grower_ohe.make_predictor(
        binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories))
    )
    preds_ohe = predictor_ohe.predict_binned(
        X_ohe, missing_values_bin_idx=255, n_threads=n_threads
    )

    depth_native = predictor.get_max_depth()
    depth_ohe = predictor_ohe.get_max_depth()
    assert depth_native <= depth_ohe
    if target == "binary" and n_unique_categories > 2:
        # OHE needs more splits to achieve the same predictions
        assert depth_native < depth_ohe

    np.testing.assert_allclose(preds, preds_ohe)
|
||||
@@ -0,0 +1,239 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from numpy.testing import assert_allclose
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import (
|
||||
_build_histogram_naive,
|
||||
_build_histogram,
|
||||
_build_histogram_no_hessian,
|
||||
_build_histogram_root_no_hessian,
|
||||
_build_histogram_root,
|
||||
_subtract_histograms,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram])
def test_build_histogram(build_func):
    # Check the per-bin count / gradient sum / hessian sum produced by
    # build_func on two hand-computed cases.
    binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE)

    def histogram_of(indices, gradients, hessians):
        """Build a fresh 3-bin histogram of feature 0 for the given samples."""
        hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE)
        build_func(
            0,
            np.array(indices, dtype=np.uint32),
            binned_feature,
            np.array(gradients, dtype=G_H_DTYPE),
            np.array(hessians, dtype=G_H_DTYPE),
            hist,
        )
        return hist[0]

    # Small sample_indices (below unrolling threshold)
    small = histogram_of([0, 2, 3], [0, 1, 3], [1, 1, 2])
    assert_array_equal(small["count"], [2, 1, 0])
    assert_allclose(small["sum_gradients"], [1, 3, 0])
    assert_allclose(small["sum_hessians"], [2, 2, 0])

    # Larger sample_indices (above unrolling threshold)
    large = histogram_of([0, 2, 3, 6, 7], [0, 1, 3, 0, 1], [1, 1, 2, 1, 0])
    assert_array_equal(large["count"], [2, 2, 1])
    assert_allclose(large["sum_gradients"], [1, 4, 0])
    assert_allclose(large["sum_hessians"], [2, 2, 1])
|
||||
|
||||
|
||||
def test_histogram_sample_order_independence():
    # Permuting the samples (indices together with their gradients and
    # hessians) must leave the computed histograms unchanged.
    rng = np.random.RandomState(42)
    n_sub_samples = 100
    n_samples = 1000
    n_bins = 256

    # Random draws are made in a fixed order so the data are reproducible.
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE)
    sample_indices = rng.choice(
        np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False
    )
    ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE)
    ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE)
    permutation = rng.permutation(n_sub_samples)

    def gradients_only_hist(indices, gradients):
        """Histogram of feature 0 built without hessians."""
        hist = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
        _build_histogram_no_hessian(0, indices, binned_feature, gradients, hist)
        return hist[0]

    def gradients_and_hessians_hist(indices, gradients, hessians):
        """Histogram of feature 0 built with gradients and hessians."""
        hist = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
        _build_histogram(0, indices, binned_feature, gradients, hessians, hist)
        return hist[0]

    # Histograms in the original sample order.
    hist_gc = gradients_only_hist(sample_indices, ordered_gradients)
    hist_ghc = gradients_and_hessians_hist(
        sample_indices, ordered_gradients, ordered_hessians
    )

    # Histograms with every per-sample array permuted consistently.
    hist_gc_perm = gradients_only_hist(
        sample_indices[permutation], ordered_gradients[permutation]
    )
    hist_ghc_perm = gradients_and_hessians_hist(
        sample_indices[permutation],
        ordered_gradients[permutation],
        ordered_hessians[permutation],
    )

    assert_allclose(hist_gc["sum_gradients"], hist_gc_perm["sum_gradients"])
    assert_array_equal(hist_gc["count"], hist_gc_perm["count"])

    assert_allclose(hist_ghc["sum_gradients"], hist_ghc_perm["sum_gradients"])
    assert_allclose(hist_ghc["sum_hessians"], hist_ghc_perm["sum_hessians"])
    assert_array_equal(hist_ghc["count"], hist_ghc_perm["count"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_unrolled_equivalent_to_naive(constant_hessian):
    # The unrolled histogram builders (root / non-root, with / without
    # hessians) must agree with the naive builder.
    rng = np.random.RandomState(42)
    n_samples = 10
    n_bins = 5
    sample_indices = np.arange(n_samples).astype(np.uint32)
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
    ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    if not constant_hessian:
        ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
    else:
        ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)

    def empty_hist():
        return np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)

    hist_gc_root = empty_hist()
    _build_histogram_root_no_hessian(0, binned_feature, ordered_gradients, hist_gc_root)

    hist_ghc_root = empty_hist()
    _build_histogram_root(
        0, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root
    )

    hist_gc = empty_hist()
    _build_histogram_no_hessian(
        0, sample_indices, binned_feature, ordered_gradients, hist_gc
    )

    hist_ghc = empty_hist()
    _build_histogram(
        0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc
    )

    hist_naive = empty_hist()
    _build_histogram_naive(
        0,
        sample_indices,
        binned_feature,
        ordered_gradients,
        ordered_hessians,
        hist_naive,
    )

    # Only feature 0 was filled in; compare that single row.
    reference = hist_naive[0]
    without_hessians = (hist_gc_root[0], hist_gc[0])
    with_hessians = (hist_ghc_root[0], hist_ghc[0])
    every_variant = (
        without_hessians[0],
        with_hessians[0],
        without_hessians[1],
        with_hessians[1],
    )

    # Counts and gradient sums must match for every variant.
    for hist in every_variant:
        assert_array_equal(hist["count"], reference["count"])
        assert_allclose(hist["sum_gradients"], reference["sum_gradients"])
    # Hessian sums must match for the variants that accumulate them ...
    for hist in with_hessians:
        assert_allclose(hist["sum_hessians"], reference["sum_hessians"])
    # ... and must be left at zero by the variants that do not.
    for hist in without_hessians:
        assert_array_equal(hist["sum_hessians"], np.zeros(n_bins))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_hist_subtraction(constant_hessian):
    # The histogram subtraction trick (child = parent - sibling) must give
    # the same result as building the child histogram directly.
    rng = np.random.RandomState(42)
    n_samples = 10
    n_bins = 5
    sample_indices = np.arange(n_samples).astype(np.uint32)
    binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8)
    ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    if not constant_hessian:
        ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
    else:
        ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE)

    def build(indices, gradients, hessians):
        """Directly build the histogram of feature 0 for a subset of samples."""
        hist = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
        if constant_hessian:
            _build_histogram_no_hessian(0, indices, binned_feature, gradients, hist)
        else:
            _build_histogram(0, indices, binned_feature, gradients, hessians, hist)
        return hist

    hist_parent = build(sample_indices, ordered_gradients, ordered_hessians)

    # Randomly send each sample to the left (True) or to the right (False).
    goes_left = rng.randint(0, 2, n_samples).astype(bool)
    goes_right = ~goes_left

    hist_left = build(
        sample_indices[goes_left],
        ordered_gradients[goes_left],
        ordered_hessians[goes_left],
    )
    hist_right = build(
        sample_indices[goes_right],
        ordered_gradients[goes_right],
        ordered_hessians[goes_right],
    )

    # Each child recovered by subtracting its sibling from the parent.
    hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE)
    _subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub)
    _subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub)

    for key in ("count", "sum_hessians", "sum_gradients"):
        assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6)
        assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6)
|
||||
@@ -0,0 +1,376 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
|
||||
from sklearn.ensemble._hist_gradient_boosting.splitting import (
|
||||
Splitter,
|
||||
compute_node_value,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
|
||||
from sklearn.ensemble import HistGradientBoostingRegressor
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
# Module-level value returned by _openmp_effective_n_threads(), evaluated once
# when this module is imported.
n_threads = _openmp_effective_n_threads()
|
||||
|
||||
|
||||
def is_increasing(a):
    """Return whether consecutive elements of ``a`` never decrease.

    Equal neighbours are allowed; an empty or single-element input counts as
    increasing.
    """
    steps = np.diff(a)
    return np.all(steps >= 0.0)
|
||||
|
||||
|
||||
def is_decreasing(a):
    """Return whether consecutive elements of ``a`` never increase.

    Equal neighbours are allowed; an empty or single-element input counts as
    decreasing.
    """
    steps = np.diff(a)
    return np.all(steps <= 0.0)
|
||||
|
||||
|
||||
def assert_leaves_values_monotonic(predictor, monotonic_cst):
    """Check the left-to-right sequence of leaf values of ``predictor``.

    The sequence must be all increasing for a POS constraint, all decreasing
    for a NEG constraint, and neither when there is no constraint.
    """
    nodes = predictor.nodes

    # Depth-first walk from the root (index 0) using an explicit stack. The
    # right child is pushed before the left one so that the left subtree is
    # visited first, which yields the leaves from left to right.
    leaf_values = []
    pending = [0]
    while pending:
        node = nodes[pending.pop()]
        if node["is_leaf"]:
            leaf_values.append(node["value"])
            continue
        pending.append(node["right"])
        pending.append(node["left"])

    if monotonic_cst == MonotonicConstraint.NO_CST:
        # some increasing, some decreasing
        assert not (is_increasing(leaf_values) or is_decreasing(leaf_values))
    elif monotonic_cst == MonotonicConstraint.POS:
        # all increasing
        assert is_increasing(leaf_values)
    else:  # NEG
        # all decreasing
        assert is_decreasing(leaf_values)
|
||||
|
||||
|
||||
def assert_children_values_monotonic(predictor, monotonic_cst):
    """Check that sibling values respect the monotonic constraint.

    The left child must be lower (resp. greater) than the right child when
    the constraint is POS (resp. NEG).

    This property alone is not enough to ensure full monotonicity: we also
    need to guarantee that none of the descendants of the left child is
    greater (resp. lower) than the right child or its descendants. That is
    why the predicted values need to be bounded, which is tested in
    assert_children_values_bounded.
    """
    nodes = predictor.nodes
    # Whether at least one decision node has left < right, resp. left > right.
    seen_left_lower = False
    seen_left_greater = False
    for node in nodes:
        if node["is_leaf"]:
            continue

        left_value = nodes[node["left"]]["value"]
        right_value = nodes[node["right"]]["value"]

        if left_value < right_value:
            seen_left_lower = True
        elif left_value > right_value:
            seen_left_greater = True

    if monotonic_cst == MonotonicConstraint.NO_CST:
        # Both orderings must occur somewhere in the tree.
        assert seen_left_lower and seen_left_greater
    elif monotonic_cst == MonotonicConstraint.POS:
        assert seen_left_lower and not seen_left_greater
    else:  # NEG
        assert not seen_left_lower and seen_left_greater
|
||||
|
||||
|
||||
def assert_children_values_bounded(grower, monotonic_cst):
    """Check that children values are bounded by the parents' middle value.

    For a node that has a right sibling, the values of the children of both
    nodes must lie on the proper side of the middle value between the node
    and that sibling. Nothing is checked when there is no constraint.

    As a bonus, the ordering of sibling values is checked too. This is
    slightly redundant with assert_children_values_monotonic, but it is done
    here on the grower nodes whereas that helper works on predictor nodes.
    """
    if monotonic_cst == MonotonicConstraint.NO_CST:
        return

    if monotonic_cst == MonotonicConstraint.POS:

        def in_order(first, second, third):
            return first <= second <= third

    else:  # NEG

        def in_order(first, second, third):
            return first >= second >= third

    def check_subtree(node, right_sibling=None):
        if node.is_leaf:
            return

        left, right = node.left_child, node.right_child
        if right_sibling is not None:
            middle = (node.value + right_sibling.value) / 2
            # Children of the left sibling stay on their side of the middle.
            assert in_order(left.value, right.value, middle)
            if not right_sibling.is_leaf:
                # Children of the right sibling stay on the other side.
                assert in_order(
                    middle,
                    right_sibling.left_child.value,
                    right_sibling.right_child.value,
                )

        check_subtree(left, right_sibling=right)
        check_subtree(right)

    check_subtree(grower.root)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize(
    "monotonic_cst",
    (
        MonotonicConstraint.NO_CST,
        MonotonicConstraint.POS,
        MonotonicConstraint.NEG,
    ),
)
def test_nodes_values(monotonic_cst, seed):
    # Grow a single tree on a single feature and check that the node values
    # respect the monotonic constraint.
    #
    # For the following tree with a monotonic POS constraint we expect:
    #
    #          root
    #         /    \
    #        5      10    # middle = 7.5
    #       / \    /  \
    #      a   b  c    d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d (assert_children_values_bounded)
    # a <= b <= c <= d (assert_leaves_values_monotonic)
    #
    # The last one follows from the others, but it can't hurt to check.

    rng = np.random.RandomState(seed)
    n_samples, n_features = 1000, 1
    X_binned = np.asfortranarray(
        rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
    )

    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(shape=1, dtype=G_H_DTYPE)

    grower = TreeGrower(
        X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
    )
    grower.grow()

    # grow() shrinks the leaf values at the very end. Undo that here,
    # otherwise the checks would compare a (shrunk) leaf value with a (not
    # shrunk) node value and would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage

    # Placeholder thresholds: predict() is not called on this predictor.
    placeholder_thresholds = np.zeros((X_binned.shape[1], X_binned.max() + 1))
    predictor = grower.make_predictor(binning_thresholds=placeholder_thresholds)

    # The bounds can only be checked on the grower tree, since node bounds
    # are not copied into the predictor tree. The checks on children and
    # leaf values could be done on either tree; they are only done on the
    # predictor tree because it is derived from the grower tree.
    assert_children_values_monotonic(predictor, monotonic_cst)
    assert_children_values_bounded(grower, monotonic_cst)
    assert_leaves_values_monotonic(predictor, monotonic_cst)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(3))
def test_predictions(seed):
    # Fit a regressor with a POS constraint on the first feature and a NEG
    # constraint on the second one, then verify through its predictions that
    # both constraints are respected.
    # Adapted from lightgbm's test_monotone_constraint(), itself inspired by
    # https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html

    rng = np.random.RandomState(seed)

    n_samples = 1000
    f_pos = rng.rand(n_samples)  # positive correlation with the target
    f_neg = rng.rand(n_samples)  # negative correlation with the target
    noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)

    # Built term by term, in the same left-to-right order as a single
    # expression would be evaluated.
    target = 5 * f_pos + np.sin(10 * np.pi * f_pos)
    target = target - 5 * f_neg - np.cos(10 * np.pi * f_neg)
    target = target + noise

    model = HistGradientBoostingRegressor(monotonic_cst=[1, -1])
    model.fit(np.column_stack((f_pos, f_neg)), target)

    grid = np.linspace(0, 1, 100)
    wave = np.sin(grid)
    fixed = np.full_like(grid, fill_value=0.5)

    # The predictions are now checked against the constraints, one feature
    # at a time. While probing a feature, the other one is held constant,
    # because monotonic constraints are "all else being equal" constraints:
    # a constraint on the first feature only means that
    #     x0 < x0' => f(x0, x1) < f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
    #     x0 < x0' => f(x0, x1) < f(x0', x1')

    # First feature (POS)
    # predictions must be all increasing when the feature is all increasing
    pred = model.predict(np.column_stack((grid, fixed)))
    assert is_increasing(pred)
    # predictions must actually follow the variations of the feature
    pred = model.predict(np.column_stack((wave, fixed)))
    pred_goes_up = np.diff(pred) >= 0
    wave_goes_up = np.diff(wave) >= 0
    assert np.all(pred_goes_up == wave_goes_up)

    # Second feature (NEG)
    # predictions must be all decreasing when the feature is all increasing
    pred = model.predict(np.column_stack((fixed, grid)))
    assert is_decreasing(pred)
    # predictions must actually follow the inverse variations of the feature
    pred = model.predict(np.column_stack((fixed, wave)))
    pred_goes_down = np.diff(pred) <= 0
    wave_goes_up = np.diff(wave) >= 0
    assert (pred_goes_down == wave_goes_up).all()
|
||||
|
||||
|
||||
def test_input_error():
    # Invalid `monotonic_cst` values must be rejected with a ValueError when
    # fit() is called. Each case pairs an estimator with the message that
    # its fit() is expected to raise.
    X = [[1, 2], [2, 3], [3, 4]]
    y = [0, 1, 2]

    cases = [
        # more constraints than features
        (
            HistGradientBoostingRegressor,
            [1, 0, -1],
            "monotonic_cst has shape 3 but the input data",
        ),
        # constraint values outside of {-1, 0, 1}
        (
            HistGradientBoostingRegressor,
            [1, 3],
            "must be None or an array-like of -1, 0 or 1",
        ),
        (
            HistGradientBoostingRegressor,
            [1, -3],
            "must be None or an array-like of -1, 0 or 1",
        ),
        # y holds three distinct classes
        (
            HistGradientBoostingClassifier,
            [0, 1],
            "monotonic constraints are not supported for multiclass classification",
        ),
    ]

    for estimator_cls, monotonic_cst, message in cases:
        gbdt = estimator_cls(monotonic_cst=monotonic_cst)
        with pytest.raises(ValueError, match=message):
            gbdt.fit(X, y)
|
||||
|
||||
|
||||
def test_bounded_value_min_gain_to_split():
    # Show that, when computing the gain of a split, the value of the
    # current node must be properly bounded to respect the monotonic
    # constraints, because it strongly interacts with min_gain_to_split.
    # The example uses gradients [1, 1, 100, 1, 1] with hessians all equal to
    # one. The best split happens on the 3rd bin and, depending on whether
    # the node value is bounded or not, the min_gain_to_split constraint is
    # or isn't satisfied.
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    min_gain_to_split = 2000
    hessians_are_constant = False
    n_samples = 5
    n_bins = n_samples

    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
    n_features = X_binned.shape[1]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.full(n_features, n_bins - 1, dtype=np.uint32)
    has_missing_values = np.zeros(n_features, dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST for _ in range(n_features)], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)

    def node_value_within(lower, upper):
        # Value of the node holding all samples, clipped to [lower, upper].
        return compute_node_value(
            sum_gradients, sum_hessians, lower, upper, l2_regularization
        )

    def best_split_given(node_value):
        # Best split of the node; the children themselves are left unbounded.
        return splitter.find_node_split(
            n_samples,
            histograms,
            sum_gradients,
            sum_hessians,
            node_value,
            lower_bound=-np.inf,
            upper_bound=np.inf,
        )

    # Since the gradient array is [1, 1, 100, 1, 1], the max possible gain
    # happens on the 3rd bin (or equivalently in the 2nd) and is equal to
    # about 1307, which is less than min_gain_to_split = 2000, so the node is
    # considered unsplittable (gain = -1).
    value = node_value_within(-np.inf, np.inf)
    # the unbounded value is equal to -sum_gradients / sum_hessians
    assert value == pytest.approx(-104 / 5)
    split_info = best_split_given(value)
    assert split_info.gain == -1  # min_gain_to_split not respected

    # Here again the max possible gain is on the 3rd bin, but the value of
    # the node is now capped into [-10, inf]. This means the gain is now
    # about 2430, which is more than the min_gain_to_split constraint.
    value = node_value_within(-10, np.inf)
    assert value == -10
    split_info = best_split_given(value)
    assert split_info.gain > min_gain_to_split
|
||||
@@ -0,0 +1,187 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import r2_score
|
||||
import pytest
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
|
||||
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
|
||||
from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import (
|
||||
G_H_DTYPE,
|
||||
PREDICTOR_RECORD_DTYPE,
|
||||
ALMOST_INF,
|
||||
X_BINNED_DTYPE,
|
||||
X_BITSET_INNER_DTYPE,
|
||||
X_DTYPE,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting._bitset import (
|
||||
set_bitset_memoryview,
|
||||
set_raw_bitset_from_binned_bitset,
|
||||
)
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
n_threads = _openmp_effective_n_threads()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", [200, 256])
def test_regression_dataset(n_bins):
    # Grow a single tree on a binned synthetic regression problem and check
    # that the resulting predictor reaches a minimum r2 score on both the
    # training and the test split.
    X, y = make_regression(
        n_samples=500, n_features=10, n_informative=5, random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    mapper = _BinMapper(n_bins=n_bins, random_state=42)
    X_train_binned = mapper.fit_transform(X_train)

    # Gradients and hessians are initialised to those of least squares loss
    grower = TreeGrower(
        X_train_binned,
        -y_train.astype(G_H_DTYPE),
        np.ones(1, dtype=G_H_DTYPE),
        min_samples_leaf=10,
        max_leaf_nodes=30,
        n_bins=n_bins,
        n_bins_non_missing=mapper.n_bins_non_missing_,
    )
    grower.grow()

    predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_)

    # Empty bitsets / feature map are passed to predict()
    known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
    f_idx_map = np.zeros(0, dtype=np.uint32)

    # (features, targets, minimum r2 score), training split first
    splits = (
        (X_train, y_train, 0.82),
        (X_test, y_test, 0.67),
    )
    for features, targets, min_r2 in splits:
        y_pred = predictor.predict(features, known_cat_bitsets, f_idx_map, n_threads)
        assert r2_score(targets, y_pred) > min_r2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "num_threshold, expected_predictions",
    [
        (-np.inf, [0, 1, 1, 1]),
        (10, [0, 0, 1, 1]),
        (20, [0, 0, 0, 1]),
        (ALMOST_INF, [0, 0, 0, 1]),
        (np.inf, [0, 0, 0, 0]),
    ],
)
def test_infinite_values_and_thresholds(num_threshold, expected_predictions):
    # Infinite values and infinite thresholds must be handled properly.
    # In particular, a +inf value with an ALMOST_INF threshold should send
    # the sample to the right child, whereas with an inf threshold (split on
    # nan) the +inf sample goes to the left child.

    X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1)

    # Minimal tree: node 0 is the root, node 1 its left leaf (value 0) and
    # node 2 its right leaf (value 1).
    nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
    root_fields = (
        ("left", 1),
        ("right", 2),
        ("feature_idx", 0),
        ("num_threshold", num_threshold),
    )
    for field, field_value in root_fields:
        nodes[field][0] = field_value
    nodes["is_leaf"][1:] = True
    nodes["value"][1] = 0
    nodes["value"][2] = 1

    # Empty bitsets / feature map: the root is given a numerical threshold
    empty_bitsets = {
        name: np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE)
        for name in ("binned", "raw", "known")
    }
    f_idx_map = np.zeros(0, dtype=np.uint32)

    predictor = TreePredictor(nodes, empty_bitsets["binned"], empty_bitsets["raw"])
    predictions = predictor.predict(X, empty_bitsets["known"], f_idx_map, n_threads)

    assert (predictions == expected_predictions).all()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bins_go_left, expected_predictions",
    [
        ([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]),
        ([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]),
        ([3, 5, 6], [0, 0, 0, 1, 0, 1]),
    ],
)
def test_categorical_predictor(bins_go_left, expected_predictions):
    # Predictor outputs must be correct with a categorical feature, on binned
    # data as well as on raw (un-binned) data.

    missing_bin = 6
    X_binned = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T
    categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE)

    bins_go_left = np.array(bins_go_left, dtype=X_BINNED_DTYPE)

    # Minimal tree: node 0 is a categorical root sending missing values to
    # the left, node 1 its left leaf (value 1), node 2 its right leaf
    # (value 0).
    nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
    root_fields = (
        ("left", 1),
        ("right", 2),
        ("feature_idx", 0),
        ("is_categorical", True),
        ("missing_go_to_left", True),
    )
    for field, field_value in root_fields:
        nodes[field][0] = field_value
    nodes["is_leaf"][1:] = True
    nodes["value"][1] = 1
    nodes["value"][2] = 0

    # Bitset of the bins going left, then its counterpart expressed on the
    # raw category values.
    binned_cat_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    raw_categorical_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE)
    for bin_idx in bins_go_left:
        set_bitset_memoryview(binned_cat_bitsets[0], bin_idx)
    set_raw_bitset_from_binned_bitset(
        raw_categorical_bitsets[0], binned_cat_bitsets[0], categories
    )

    predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets)

    # Binned data must give the correct predictions
    binned_predictions = predictor.predict_binned(
        X_binned, missing_values_bin_idx=missing_bin, n_threads=n_threads
    )
    assert_allclose(binned_predictions, expected_predictions)

    # Bitset of the known categories, constructed manually
    known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32)
    known_cat_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32)
    f_idx_map = np.array([0], dtype=np.uint32)

    # Un-binned data must give the same predictions
    raw_predictions = predictor.predict(
        categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(raw_predictions, expected_predictions)

    # The missing-values bin (6) must go left
    X_binned_missing = np.array([[missing_bin]], dtype=X_BINNED_DTYPE).T
    missing_predictions = predictor.predict_binned(
        X_binned_missing, missing_values_bin_idx=missing_bin, n_threads=n_threads
    )
    assert_allclose(missing_predictions, [1])

    # Missing (nan) and unknown (17) raw values must both go left
    X_missing_and_unknown = np.array([[np.nan, 17]], dtype=X_DTYPE).T
    unknown_predictions = predictor.predict(
        X_missing_and_unknown, known_cat_bitsets, f_idx_map, n_threads
    )
    assert_allclose(unknown_predictions, [1, 1])
|
||||
@@ -0,0 +1,858 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
|
||||
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
|
||||
from sklearn.ensemble._hist_gradient_boosting.splitting import (
|
||||
Splitter,
|
||||
compute_node_value,
|
||||
)
|
||||
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
|
||||
from sklearn.utils._testing import skip_if_32bit
|
||||
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
|
||||
|
||||
n_threads = _openmp_effective_n_threads()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_bins", [3, 32, 256])
def test_histogram_split(n_bins):
    # For every candidate bin and both gradient signs, build gradients whose
    # sign flips exactly after `true_bin` and check that the splitter picks
    # that bin on the single feature.
    rng = np.random.RandomState(42)
    feature_idx = 0
    l2_regularization = 0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    hessians_are_constant = False

    X_binned = np.asfortranarray(
        rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE
    )
    n_features = X_binned.shape[1]
    binned_feature = X_binned.T[feature_idx]
    n_samples = binned_feature.shape[0]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    sum_hessians = all_hessians.sum()

    # Splitter inputs that do not depend on the gradients
    n_bins_non_missing = np.full(n_features, n_bins - 1, dtype=np.uint32)
    has_missing_values = np.zeros(n_features, dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST for _ in range(n_features)], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1

    for true_bin in range(1, n_bins - 2):
        at_or_below = binned_feature <= true_bin
        for sign in [-1, 1]:
            # -sign up to and including true_bin, +sign above it
            all_gradients = np.where(at_or_below, -sign, sign).astype(G_H_DTYPE)
            sum_gradients = all_gradients.sum()

            builder = HistogramBuilder(
                X_binned,
                n_bins,
                all_gradients,
                all_hessians,
                hessians_are_constant,
                n_threads,
            )
            splitter = Splitter(
                X_binned,
                n_bins_non_missing,
                missing_values_bin_idx,
                has_missing_values,
                is_categorical,
                monotonic_cst,
                l2_regularization,
                min_hessian_to_split,
                min_samples_leaf,
                min_gain_to_split,
                hessians_are_constant,
            )

            histograms = builder.compute_histograms_brute(sample_indices)
            value = compute_node_value(
                sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
            )
            split_info = splitter.find_node_split(
                n_samples, histograms, sum_gradients, sum_hessians, value
            )

            assert split_info.bin_idx == true_bin
            assert split_info.gain >= 0
            assert split_info.feature_idx == feature_idx
            n_samples_children = split_info.n_samples_left + split_info.n_samples_right
            assert n_samples_children == n_samples
            # Every sample has a hessian of 1.
            assert split_info.n_samples_left == split_info.sum_hessian_left
|
||||
|
||||
|
||||
@skip_if_32bit
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
    # The values of gradients and hessians must be consistent in different
    # places:
    # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
    #   equal to the gradient at the node. Same for hessians.
    # - in the histograms: summing 'sum_gradients' over the bins must be
    #   constant across all features, and those sums must be equal to the
    #   node's gradient. Same for hessians.

    rng = np.random.RandomState(42)

    n_bins = 10
    n_features = 20
    n_samples = 500
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE)
    )
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    if not constant_hessian:
        all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
        sum_hessians = all_hessians.sum()
    else:
        # a single entry stands for the hessian shared by all samples
        all_hessians = np.ones(1, dtype=G_H_DTYPE)
        sum_hessians = 1 * n_samples

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads
    )
    n_cols = X_binned.shape[1]
    n_bins_non_missing = np.full(n_cols, n_bins - 1, dtype=np.uint32)
    has_missing_values = np.zeros(n_cols, dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST for _ in range(n_cols)], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        constant_hessian,
    )

    def unbounded_value(grad_sum, hess_sum):
        # Node value computed without any lower/upper bound.
        return compute_node_value(
            grad_sum, hess_sum, -np.inf, np.inf, l2_regularization
        )

    def best_split(hists, grad_sum, hess_sum, node_value):
        # NOTE(review): the parent's n_samples is passed for the children
        # too, exactly as the test has always done -- confirm whether the
        # child sample counts were intended.
        return splitter.find_node_split(
            n_samples, hists, grad_sum, hess_sum, node_value
        )

    # Parent node, then the partition of its samples into two children
    hists_parent = builder.compute_histograms_brute(sample_indices)
    value_parent = unbounded_value(sum_gradients, sum_hessians)
    si_parent = best_split(hists_parent, sum_gradients, sum_hessians, value_parent)
    sample_indices_left, sample_indices_right, _ = splitter.split_indices(
        si_parent, sample_indices
    )

    # Histograms and values of both children, then their own best splits
    hists_left = builder.compute_histograms_brute(sample_indices_left)
    value_left = unbounded_value(
        si_parent.sum_gradient_left, si_parent.sum_hessian_left
    )
    hists_right = builder.compute_histograms_brute(sample_indices_right)
    value_right = unbounded_value(
        si_parent.sum_gradient_right, si_parent.sum_hessian_right
    )
    si_left = best_split(
        hists_left,
        si_parent.sum_gradient_left,
        si_parent.sum_hessian_left,
        value_left,
    )
    si_right = best_split(
        hists_right,
        si_parent.sum_gradient_right,
        si_parent.sum_hessian_right,
        value_right,
    )

    node_indices = (sample_indices, sample_indices_left, sample_indices_right)

    # si.sum_gradient_left + si.sum_gradient_right must have their expected
    # value, same for hessians
    for si, indices in zip((si_parent, si_left, si_right), node_indices):
        gradient = si.sum_gradient_right + si.sum_gradient_left
        hessian = si.sum_hessian_right + si.sum_hessian_left
        expected_gradient = all_gradients[indices].sum()
        expected_hessian = (
            indices.shape[0] * all_hessians[0]
            if constant_hessian
            else all_hessians[indices].sum()
        )

        assert np.isclose(gradient, expected_gradient)
        assert np.isclose(hessian, expected_hessian)

    # the sums of gradients in the histograms must be the same for all
    # features, and equal to their expected value
    node_hists = [
        np.asarray(hists, dtype=HISTOGRAM_DTYPE)
        for hists in (hists_parent, hists_left, hists_right)
    ]
    for hists, indices in zip(node_hists, node_indices):
        # note: gradients and hessians have shape (n_features,) and are
        # compared to *scalars*. This has the benefit of also making sure
        # that all the entries are equal across features.
        gradients = hists["sum_gradients"].sum(axis=1)  # shape = (n_features,)
        hessians = hists["sum_hessians"].sum(axis=1)
        expected_gradient = all_gradients[indices].sum()  # scalar
        # 0 is not the actual hessian, but it's not computed when the
        # hessians are constant
        expected_hessian = 0.0 if constant_hessian else all_hessians[indices].sum()

        assert np.allclose(gradients, expected_gradient)
        assert np.allclose(hessians, expected_hessian)
|
||||
|
||||
|
||||
def test_split_indices():
    # split_indices must return the correct splits, and splitter.partition
    # must be consistent with what is returned.
    rng = np.random.RandomState(421)

    n_bins = 5
    n_samples = 10
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    hessians_are_constant = True

    # Feature 0 is constant; the split will happen on feature 1 and on bin 3
    feature_1_bins = [0, 3, 4, 0, 0, 0, 0, 4, 0, 4]
    X_binned = np.asfortranarray(
        [[0, bin_idx] for bin_idx in feature_1_bins], dtype=X_BINNED_DTYPE
    )
    n_features = X_binned.shape[1]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.full(n_features, n_bins, dtype=np.uint32)
    has_missing_values = np.zeros(n_features, dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST for _ in range(n_features)], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    # Before any split, the partition is expected to list the samples in order
    assert (sample_indices == splitter.partition).all()

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    si_root = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    # sanity checks for best split
    assert (si_root.feature_idx, si_root.bin_idx) == (1, 3)

    samples_left, samples_right, position_right = splitter.split_indices(
        si_root, splitter.partition
    )
    assert set(samples_left) == {0, 1, 3, 4, 5, 6, 8}
    assert set(samples_right) == {2, 7, 9}

    # The children are the two sides of the partition around position_right
    left_part = splitter.partition[:position_right]
    right_part = splitter.partition[position_right:]
    assert list(samples_left) == list(left_part)
    assert list(samples_right) == list(right_part)

    # The sizes of the resulting split indices must be consistent with the
    # count statistics anticipated when looking for the best split.
    assert samples_left.shape[0] == si_root.n_samples_left
    assert samples_right.shape[0] == si_root.n_samples_right
|
||||
|
||||
|
||||
def test_min_gain_to_split():
    # Try to split a pure node (all gradients are equal, same for hessians)
    # with min_gain_to_split = 0 and make sure that the node is not split
    # (best possible gain = -1). Note: before the strict inequality
    # comparison, this test would fail because the node would be split with
    # a gain of 0.
    rng = np.random.RandomState(42)
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    hessians_are_constant = False
    n_bins = 255
    n_samples = 100

    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE
    )
    n_features = X_binned.shape[1]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    # One gradient and one hessian per sample, all equal to 1
    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
    all_gradients = np.ones(n_samples, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.full(n_features, n_bins - 1, dtype=np.uint32)
    has_missing_values = np.zeros(n_features, dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST for _ in range(n_features)], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    node_value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, node_value
    )
    assert split_info.gain == -1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"X_binned, all_gradients, has_missing_values, n_bins_non_missing, "
|
||||
" expected_split_on_nan, expected_bin_idx, expected_go_to_left",
|
||||
[
|
||||
# basic sanity check with no missing values: given the gradient
|
||||
# values, the split must occur on bin_idx=3
|
||||
(
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients
|
||||
False, # no missing values
|
||||
10, # n_bins_non_missing
|
||||
False, # don't split on nans
|
||||
3, # expected_bin_idx
|
||||
"not_applicable",
|
||||
),
|
||||
# We replace 2 samples by NaNs (bin_idx=8)
|
||||
# These 2 samples were mapped to the left node before, so they should
|
||||
# be mapped to left node again
|
||||
# Notice how the bin_idx threshold changes from 3 to 1.
|
||||
(
|
||||
[8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
8, # n_bins_non_missing
|
||||
False, # don't split on nans
|
||||
1, # cut on bin_idx=1
|
||||
True,
|
||||
), # missing values go to left
|
||||
# same as above, but with non-consecutive missing_values_bin
|
||||
(
|
||||
[9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
8, # n_bins_non_missing
|
||||
False, # don't split on nans
|
||||
1, # cut on bin_idx=1
|
||||
True,
|
||||
), # missing values go to left
|
||||
# this time replacing 2 samples that were on the right.
|
||||
(
|
||||
[0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
8, # n_bins_non_missing
|
||||
False, # don't split on nans
|
||||
3, # cut on bin_idx=3 (like in first case)
|
||||
False,
|
||||
), # missing values go to right
|
||||
# same as above, but with non-consecutive missing_values_bin
|
||||
(
|
||||
[0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
8, # n_bins_non_missing
|
||||
False, # don't split on nans
|
||||
3, # cut on bin_idx=3 (like in first case)
|
||||
False,
|
||||
), # missing values go to right
|
||||
# For the following cases, split_on_nans is True (we replace all of
|
||||
# the samples with nans, instead of just 2).
|
||||
(
|
||||
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
4, # n_bins_non_missing
|
||||
True, # split on nans
|
||||
3, # cut on bin_idx=3
|
||||
False,
|
||||
), # missing values go to right
|
||||
# same as above, but with non-consecutive missing_values_bin
|
||||
(
|
||||
[0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing
|
||||
[1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
4, # n_bins_non_missing
|
||||
True, # split on nans
|
||||
3, # cut on bin_idx=3
|
||||
False,
|
||||
), # missing values go to right
|
||||
(
|
||||
[6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
6, # n_bins_non_missing
|
||||
True, # split on nans
|
||||
5, # cut on bin_idx=5
|
||||
False,
|
||||
), # missing values go to right
|
||||
# same as above, but with non-consecutive missing_values_bin
|
||||
(
|
||||
[9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing
|
||||
[1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
|
||||
True, # missing values
|
||||
6, # n_bins_non_missing
|
||||
True, # split on nans
|
||||
5, # cut on bin_idx=5
|
||||
False,
|
||||
), # missing values go to right
|
||||
],
|
||||
)
|
||||
def test_splitting_missing_values(
    X_binned,
    all_gradients,
    has_missing_values,
    n_bins_non_missing,
    expected_split_on_nan,
    expected_bin_idx,
    expected_go_to_left,
):
    """Check that numerical splits cope with a missing-values bin.

    The gradients are crafted so that, without missing values, the best split
    is on bin_idx=3. Missing values are then introduced and the test checks:

    - the bin chosen by ``find_node_split`` (it is the same split, even
      though the index of the bin may change);
    - the child that samples with missing values are sent to by
      ``split_indices``.
    """
    n_samples = len(X_binned)
    n_bins = max(X_binned) + 1
    # The highest bin index found in the data stands for "missing".
    missing_values_bin_idx = n_bins - 1

    # Splitter hyper-parameters: no regularization, no extra constraint.
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    # Single-feature binned matrix, Fortran-ordered.
    binned = np.asfortranarray(
        np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    )
    gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)
    hessians_are_constant = True
    sum_gradients = gradients.sum()
    sum_hessians = n_samples  # every hessian equals 1
    sample_indices = np.arange(n_samples, dtype=np.uint32)

    # Per-feature metadata (there is only one feature here).
    missing_flags = np.array([has_missing_values], dtype=np.uint8)
    non_missing_counts = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)

    builder = HistogramBuilder(
        binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
    )
    splitter = Splitter(
        binned,
        non_missing_counts,
        missing_values_bin_idx,
        missing_flags,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    node_value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, node_value
    )

    assert split_info.bin_idx == expected_bin_idx
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_go_to_left

    # Splitting "on nans" means the threshold is the last non-missing bin.
    split_on_nan = split_info.bin_idx == non_missing_counts[0] - 1
    assert split_on_nan == expected_split_on_nan

    # Partition the samples; this also checks that missing values are sent to
    # the expected child by split_indices().
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )

    if expected_split_on_nan:
        # When splitting on nans, exactly the samples with missing values end
        # up in the right child.
        is_missing = binned == missing_values_bin_idx
        assert set(samples_right) == set(np.flatnonzero(is_missing))
        assert set(samples_left) == set(np.flatnonzero(~is_missing))
    else:
        # Otherwise the partition is the same for every parametrization.
        assert set(samples_left) == {0, 1, 2, 3}
        assert set(samples_right) == {4, 5, 6, 7, 8, 9}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X_binned, has_missing_values, n_bins_non_missing, ",
    [
        # a single category
        ([0] * 20, False, 1),
        # every category has fewer samples than MIN_CAT_SUPPORT (hardcoded to 10)
        ([0] * 9 + [1] * 8, False, 2),
        # only one category has more samples than MIN_CAT_SUPPORT
        ([0] * 12 + [1] * 8, False, 2),
        # missing values (bin 9) + categories below MIN_CAT_SUPPORT
        ([0] * 9 + [1] * 8 + [9] * 4, True, 2),
        # nothing but missing values
        ([9] * 11, True, 0),
    ],
)
def test_splitting_categorical_cat_smooth(
    X_binned, has_missing_values, n_bins_non_missing
):
    """Check that no categorical split is returned in these degenerate cases.

    Each parametrization violates the MIN_CAT_SUPPORT constraint in some way,
    so ``find_node_split`` is expected to report a gain of -1.
    """
    n_samples = len(X_binned)
    n_bins = max(X_binned) + 1
    # The highest bin index found in the data stands for "missing".
    missing_values_bin_idx = n_bins - 1

    # Splitter hyper-parameters: no regularization, no extra constraint.
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    # Single-feature binned matrix, Fortran-ordered.
    binned = np.asfortranarray(
        np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    )
    # Constant gradients and hessians: all samples look alike.
    gradients = np.ones(n_samples, dtype=G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)
    hessians_are_constant = True
    sum_gradients = gradients.sum()
    sum_hessians = n_samples
    sample_indices = np.arange(n_samples, dtype=np.uint32)

    # Per-feature metadata; the only feature is flagged as categorical.
    missing_flags = np.array([has_missing_values], dtype=np.uint8)
    non_missing_counts = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)

    builder = HistogramBuilder(
        binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
    )
    splitter = Splitter(
        binned,
        non_missing_counts,
        missing_values_bin_idx,
        missing_flags,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    node_value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, node_value
    )

    # A gain of -1 signals that no split was found.
    assert split_info.gain == -1
|
||||
|
||||
|
||||
def _assert_categories_equals_bitset(categories, bitset):
    """Assert that `bitset` has exactly the bits listed in `categories` set.

    `bitset` is assumed to be an array of 8 uint32 words: category ``c`` is
    stored as bit ``c % 32`` of word ``c // 32``.
    """
    # Rebuild the reference bitset from the list of categories.
    reference = np.zeros(8, dtype=np.uint32)
    for category in categories:
        word, bit = divmod(category, 32)
        reference[word] |= 1 << bit

    assert_array_equal(reference, bitset)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X_binned, all_gradients, expected_categories_left, n_bins_non_missing,"
    "missing_values_bin_idx, has_missing_values, expected_missing_go_to_left",
    [
        # 4 categories, category 1 stands out
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 1, 10, 10] * 11,  # all_gradients
            [1],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,  # expected_missing_go_to_left, unchecked
        ),
        # Categories located in the right (second) half of the sorted
        # categories array can still go to the left child. Here the best
        # split was found when scanning from right to left.
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 10, 10, 1] * 11,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,  # expected_missing_go_to_left, unchecked
        ),
        # Categories that don't respect MIN_CAT_SUPPORT (cat 4) are always
        # mapped to the right child.
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [10] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,  # expected_missing_go_to_left, unchecked
        ),
        # Same rule, but here a more sensible split could have been
        # 3, 4 - 0, 1, 2. The split is still 3 - 0, 1, 2, 4 because only the
        # first half of the sorted category array (0, 1, 2, 3) is scanned and
        # cat 4 is excluded from that array.
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [1] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,  # expected_missing_go_to_left, unchecked
        ),
        # 3 categories plus a missing-values bin (9); missing go to the right
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [10] * 11,  # all_gradients
            [1],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            False,  # expected_missing_go_to_left
        ),
        # 3 categories plus a missing-values bin (9); missing go to the left
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [1] * 11,  # all_gradients
            [1, 9],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            True,  # expected_missing_go_to_left
        ),
        # the split isolates the missing-values bin
        (
            [0, 1, 2, 3, 4] * 11 + [255] * 12,  # X_binned
            [10, 10, 10, 10, 10] * 11 + [1] * 12,  # all_gradients
            [255],  # expected_categories_left
            5,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,  # expected_missing_go_to_left
        ),
        # alternating gradients: every other bin (the odd ones) goes left
        (
            list(range(60)) * 12,  # X_binned
            [10, 1] * 360,  # all_gradients
            list(range(1, 60, 2)),  # expected_categories_left
            59,  # n_bins_non_missing
            59,  # missing_values_bin_idx
            True,  # has_missing_values
            True,  # expected_missing_go_to_left
        ),
        # one bin out of every 8 goes left
        (
            list(range(256)) * 12,  # X_binned
            [10, 10, 10, 10, 10, 10, 10, 1] * 384,  # all_gradients
            list(range(7, 256, 8)),  # expected_categories_left
            255,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,  # expected_missing_go_to_left
        ),
    ],
)
def test_splitting_categorical_sanity(
    X_binned,
    all_gradients,
    expected_categories_left,
    n_bins_non_missing,
    missing_values_bin_idx,
    has_missing_values,
    expected_missing_go_to_left,
):
    """Exercise categorical splits on a range of hand-crafted inputs.

    For each case the test checks the bitset of categories sent to the left
    child, the direction taken by missing values (when there are some) and
    the resulting partition of the samples.
    """
    n_samples = len(X_binned)
    n_bins = max(X_binned) + 1

    # Splitter hyper-parameters: no regularization, no extra constraint.
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    # Single-feature binned matrix, Fortran-ordered.
    binned = np.asfortranarray(
        np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    )
    gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)
    hessians_are_constant = True
    sum_gradients = gradients.sum()
    sum_hessians = n_samples
    sample_indices = np.arange(n_samples, dtype=np.uint32)

    # Per-feature metadata; the only feature is flagged as categorical.
    missing_flags = np.array([has_missing_values], dtype=np.uint8)
    non_missing_counts = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)

    builder = HistogramBuilder(
        binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
    )
    splitter = Splitter(
        binned,
        non_missing_counts,
        missing_values_bin_idx,
        missing_flags,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    node_value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, node_value
    )

    assert split_info.is_categorical
    assert split_info.gain > 0
    _assert_categories_equals_bitset(
        expected_categories_left, split_info.left_cat_bitset
    )
    # If there is no missing value during training, the flag
    # missing_go_to_left is set later in the grower, hence the guard.
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_missing_go_to_left

    # Finally make sure the samples themselves are partitioned accordingly.
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )
    goes_left = np.isin(binned.ravel(), expected_categories_left)
    assert_array_equal(sample_indices[goes_left], samples_left)
    assert_array_equal(sample_indices[~goes_left], samples_right)
|
||||
@@ -0,0 +1,233 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_equal
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.datasets import make_classification, make_regression
|
||||
|
||||
from sklearn.ensemble import HistGradientBoostingRegressor
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.metrics import check_scoring
|
||||
|
||||
|
||||
# Small synthetic datasets reused by the parametrized tests of this module;
# the fixed random_state keeps them identical from one run to the next.
X_classification, y_classification = make_classification(random_state=0)
|
||||
X_regression, y_regression = make_regression(random_state=0)
|
||||
|
||||
|
||||
def _assert_predictor_equal(gb_1, gb_2, X):
    """Assert that two HistGBM instances are identical.

    Parameters
    ----------
    gb_1, gb_2 : fitted estimators
        Objects exposing ``_predictors`` (a sequence with one entry per
        iteration, each entry being a sequence of predictors with a ``nodes``
        array) and a ``predict`` method.

    X : array-like
        Data on which the predictions of both estimators are compared.

    Raises
    ------
    AssertionError
        If the ensembles differ in size, in any tree's nodes or in their
        predictions on `X`.
    """
    # zip() silently stops at the shortest input, so without these explicit
    # size checks an ensemble with fewer iterations (or fewer trees per
    # iteration) would only be compared on the common prefix.
    assert len(gb_1._predictors) == len(gb_2._predictors)

    # Check identical nodes for each tree
    for pred_ith_1, pred_ith_2 in zip(gb_1._predictors, gb_2._predictors):
        assert len(pred_ith_1) == len(pred_ith_2)
        for predictor_1, predictor_2 in zip(pred_ith_1, pred_ith_2):
            assert_array_equal(predictor_1.nodes, predictor_2.nodes)

    # Check identical predictions
    assert_allclose(gb_1.predict(X), gb_2.predict(X))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
    """Lowering max_iter below n_iter_ before a warm-started refit must fail.

    The model is first fitted for 10 iterations with warm_start=True; asking
    for only 5 iterations afterwards is expected to raise a ValueError.
    """
    model = GradientBoosting(max_iter=10, early_stopping=False, warm_start=True)
    model.fit(X, y)

    # Request fewer iterations than were already performed.
    model.set_params(max_iter=5)
    expected_message = (
        "max_iter=5 must be larger than or equal to n_iter_=10 when warm_start==True"
    )
    with pytest.raises(ValueError, match=expected_message):
        model.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
    """Fitting 50 then 25 warm-started iterations must equal fitting 75 at once."""
    # Settings shared by both models so that they only differ in the way the
    # 75 iterations are reached.
    shared_params = dict(n_iter_no_change=100, random_state=42)

    # 50 iterations, then 25 more on top of them.
    incremental = GradientBoosting(max_iter=50, warm_start=True, **shared_params)
    incremental.fit(X, y)
    incremental.set_params(max_iter=75)
    incremental.fit(X, y)

    # 75 iterations in a single fit.
    one_shot = GradientBoosting(max_iter=75, warm_start=False, **shared_params)
    one_shot.fit(X, y)

    _assert_predictor_equal(incremental, one_shot, X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_max_depth(GradientBoosting, X, y):
    """Trees of different depths can coexist in one warm-started ensemble."""
    initial_params = dict(
        max_iter=20,
        min_samples_leaf=1,
        warm_start=True,
        max_depth=2,
        early_stopping=False,
    )
    gb = GradientBoosting(**initial_params)
    gb.fit(X, y)

    # Grow 10 more iterations with deeper trees.
    gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
    gb.fit(X, y)

    # The 20 trees of the first fit keep max_depth == 2 ...
    for position in range(20):
        assert gb._predictors[position][0].get_max_depth() == 2
    # ... while the 10 trees added by the second fit have max_depth == 3.
    for position in range(-10, 0):
        assert gb._predictors[position][0].get_max_depth() == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
@pytest.mark.parametrize("scoring", (None, "loss"))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
    """A warm-started refit must stop early after only a few new iterations."""
    patience = 5
    gb = GradientBoosting(
        max_iter=10000,
        early_stopping=True,
        n_iter_no_change=patience,
        tol=1e-3,
        scoring=scoring,
        warm_start=True,
        random_state=42,
    )

    gb.fit(X, y)
    n_iter_first_fit = gb.n_iter_

    gb.fit(X, y)
    added_iterations = gb.n_iter_ - n_iter_first_fit

    # The second fit adds at least one iteration but fewer than the patience.
    assert 0 < added_iterations < patience
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
    """A clone fitted with warm_start=True and equal max_iter matches the original."""
    reference = GradientBoosting(max_depth=2, early_stopping=False)
    reference.fit(X, y)

    # Same configuration, except that warm start is switched on.
    warm_clone = clone(reference)
    warm_clone.set_params(
        warm_start=True, n_iter_no_change=5, max_iter=reference.max_iter
    )
    warm_clone.fit(X, y)

    _assert_predictor_equal(reference, warm_clone, X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
def test_warm_start_clear(GradientBoosting, X, y):
    """Refitting after switching warm_start off must discard the previous state."""
    shared_params = dict(n_iter_no_change=5, random_state=42)

    reference = GradientBoosting(**shared_params)
    reference.fit(X, y)

    refitted = GradientBoosting(warm_start=True, **shared_params)
    refitted.fit(X, y)  # first fit: builds the warm-start state
    refitted.set_params(warm_start=False)
    refitted.fit(X, y)  # second fit: must start again from scratch

    # Both models must expose the same score histories ...
    for score_attr in ("train_score_", "validation_score_"):
        assert_allclose(
            getattr(reference, score_attr), getattr(refitted, score_attr)
        )

    # ... and the same trees and predictions.
    _assert_predictor_equal(reference, refitted, X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (HistGradientBoostingClassifier, X_classification, y_classification),
        (HistGradientBoostingRegressor, X_regression, y_regression),
    ],
)
@pytest.mark.parametrize("rng_type", ("none", "int", "instance"))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    """Check how ``_random_seed`` evolves across fits, with and without warm start.

    The seed is the one used for the train/val split and the small trainset
    subsampling. It is read after each of two consecutive fits, for an
    estimator without warm start (``gb_1``) and one with warm start
    (``gb_2``).
    """

    def _get_rng(rng_type):
        # Helper to avoid consuming rngs: a fresh random_state value is built
        # on every call so that both estimators start from the same state.
        if rng_type == "none":
            return None
        elif rng_type == "int":
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed

    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # clear the old state, different seed

    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(
        early_stopping=True, max_iter=2, random_state=random_state, warm_start=True
    )
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # warm-started refit: the seed of the first fit is kept
    random_seed_2_2 = gb_2._random_seed

    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == "none":
        # A chained `a != b != c` only compares neighbours and would never
        # compare the first seed with the third one, so each pair is checked
        # explicitly.
        assert random_seed_1_1 != random_seed_1_2
        assert random_seed_1_2 != random_seed_2_1
        assert random_seed_1_1 != random_seed_2_1
    elif rng_type == "int":
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2

    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2
|
||||
Binary file not shown.
@@ -0,0 +1,530 @@
|
||||
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numbers
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from warnings import warn
|
||||
|
||||
from ..tree import ExtraTreeRegressor
|
||||
from ..utils import (
|
||||
check_random_state,
|
||||
check_array,
|
||||
gen_batches,
|
||||
get_chunk_n_rows,
|
||||
)
|
||||
from ..utils.validation import check_is_fitted, _num_samples
|
||||
from ..base import OutlierMixin
|
||||
|
||||
from ._bagging import BaseBagging
|
||||
|
||||
# Names exported by a star-import of this module.
__all__ = ["IsolationForest"]
|
||||
|
||||
|
||||
class IsolationForest(OutlierMixin, BaseBagging):
|
||||
"""
|
||||
Isolation Forest Algorithm.
|
||||
|
||||
Return the anomaly score of each sample using the IsolationForest algorithm
|
||||
|
||||
The IsolationForest 'isolates' observations by randomly selecting a feature
|
||||
and then randomly selecting a split value between the maximum and minimum
|
||||
values of the selected feature.
|
||||
|
||||
Since recursive partitioning can be represented by a tree structure, the
|
||||
number of splittings required to isolate a sample is equivalent to the path
|
||||
length from the root node to the terminating node.
|
||||
|
||||
This path length, averaged over a forest of such random trees, is a
|
||||
measure of normality and our decision function.
|
||||
|
||||
Random partitioning produces noticeably shorter paths for anomalies.
|
||||
Hence, when a forest of random trees collectively produce shorter path
|
||||
lengths for particular samples, they are highly likely to be anomalies.
|
||||
|
||||
Read more in the :ref:`User Guide <isolation_forest>`.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_estimators : int, default=100
|
||||
The number of base estimators in the ensemble.
|
||||
|
||||
max_samples : "auto", int or float, default="auto"
|
||||
The number of samples to draw from X to train each base estimator.
|
||||
- If int, then draw `max_samples` samples.
|
||||
- If float, then draw `max_samples * X.shape[0]` samples.
|
||||
- If "auto", then `max_samples=min(256, n_samples)`.
|
||||
|
||||
If max_samples is larger than the number of samples provided,
|
||||
all samples will be used for all trees (no sampling).
|
||||
|
||||
contamination : 'auto' or float, default='auto'
|
||||
The amount of contamination of the data set, i.e. the proportion
|
||||
of outliers in the data set. Used when fitting to define the threshold
|
||||
on the scores of the samples.
|
||||
|
||||
- If 'auto', the threshold is determined as in the
|
||||
original paper.
|
||||
- If float, the contamination should be in the range (0, 0.5].
|
||||
|
||||
.. versionchanged:: 0.22
|
||||
The default value of ``contamination`` changed from 0.1
|
||||
to ``'auto'``.
|
||||
|
||||
max_features : int or float, default=1.0
|
||||
The number of features to draw from X to train each base estimator.
|
||||
|
||||
- If int, then draw `max_features` features.
|
||||
- If float, then draw `max_features * X.shape[1]` features.
|
||||
|
||||
bootstrap : bool, default=False
|
||||
If True, individual trees are fit on random subsets of the training
|
||||
data sampled with replacement. If False, sampling without replacement
|
||||
is performed.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to run in parallel for both :meth:`fit` and
|
||||
:meth:`predict`. ``None`` means 1 unless in a
|
||||
:obj:`joblib.parallel_backend` context. ``-1`` means using all
|
||||
processors. See :term:`Glossary <n_jobs>` for more details.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Controls the pseudo-randomness of the selection of the feature
|
||||
and split values for each branching step and each tree in the forest.
|
||||
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
verbose : int, default=0
|
||||
Controls the verbosity of the tree building process.
|
||||
|
||||
warm_start : bool, default=False
|
||||
When set to ``True``, reuse the solution of the previous call to fit
|
||||
and add more estimators to the ensemble, otherwise, just fit a whole
|
||||
new forest. See :term:`the Glossary <warm_start>`.
|
||||
|
||||
.. versionadded:: 0.21
|
||||
|
||||
Attributes
|
||||
----------
|
||||
base_estimator_ : ExtraTreeRegressor instance
|
||||
The child estimator template used to create the collection of
|
||||
fitted sub-estimators.
|
||||
|
||||
estimators_ : list of ExtraTreeRegressor instances
|
||||
The collection of fitted sub-estimators.
|
||||
|
||||
estimators_features_ : list of ndarray
|
||||
The subset of drawn features for each base estimator.
|
||||
|
||||
estimators_samples_ : list of ndarray
|
||||
The subset of drawn samples (i.e., the in-bag samples) for each base
|
||||
estimator.
|
||||
|
||||
max_samples_ : int
|
||||
The actual number of samples.
|
||||
|
||||
offset_ : float
|
||||
Offset used to define the decision function from the raw scores. We
|
||||
have the relation: ``decision_function = score_samples - offset_``.
|
||||
``offset_`` is defined as follows. When the contamination parameter is
|
||||
set to "auto", the offset is equal to -0.5 as the scores of inliers are
|
||||
close to 0 and the scores of outliers are close to -1. When a
|
||||
contamination parameter different than "auto" is provided, the offset
|
||||
is defined in such a way we obtain the expected number of outliers
|
||||
(samples with decision function < 0) in training.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
n_features_ : int
|
||||
The number of features when ``fit`` is performed.
|
||||
|
||||
.. deprecated:: 1.0
|
||||
Attribute `n_features_` was deprecated in version 1.0 and will be
|
||||
removed in 1.2. Use `n_features_in_` instead.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a
|
||||
Gaussian distributed dataset.
|
||||
sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.
|
||||
Estimate the support of a high-dimensional distribution.
|
||||
The implementation is based on libsvm.
|
||||
sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
|
||||
using Local Outlier Factor (LOF).
|
||||
|
||||
Notes
|
||||
-----
|
||||
The implementation is based on an ensemble of ExtraTreeRegressor. The
|
||||
maximum depth of each tree is set to ``ceil(log_2(n))`` where
|
||||
:math:`n` is the number of samples used to build the tree
|
||||
(see (Liu et al., 2008) for more details).
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
|
||||
Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
|
||||
.. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
|
||||
anomaly detection." ACM Transactions on Knowledge Discovery from
|
||||
Data (TKDD) 6.1 (2012): 3.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.ensemble import IsolationForest
|
||||
>>> X = [[-1.1], [0.3], [0.5], [100]]
|
||||
>>> clf = IsolationForest(random_state=0).fit(X)
|
||||
>>> clf.predict([[0.1], [0], [90]])
|
||||
array([ 1, 1, -1])
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    *,
    n_estimators=100,
    max_samples="auto",
    contamination="auto",
    max_features=1.0,
    bootstrap=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
):
    # Template for the trees of the ensemble. Its max_features=1 has no link
    # with the max_features parameter of the forest itself.
    tree_template = ExtraTreeRegressor(
        max_features=1, splitter="random", random_state=random_state
    )
    super().__init__(
        base_estimator=tree_template,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap=bootstrap,
        bootstrap_features=False,
        warm_start=warm_start,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
    )

    # The only parameter that is not forwarded to the parent constructor.
    self.contamination = contamination
|
||||
|
||||
def _set_oob_score(self, X, y):
    """Always raise: out-of-bag scoring is not available for this estimator."""
    message = "OOB score not supported by iforest"
    raise NotImplementedError(message)
|
||||
|
||||
def _parallel_args(self):
    """Return the parallelism hints: a thread-based backend is preferred."""
    # ExtraTreeRegressor releases the GIL, so threads are more efficient than
    # processes here: they avoid the communication overhead and the extra
    # memory copies of a process-based backend.
    return dict(prefer="threads")
|
||||
|
||||
def fit(self, X, y=None, sample_weight=None):
    """
    Fit estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Use ``dtype=np.float32`` for maximum
        efficiency. Sparse matrices are also supported, use sparse
        ``csc_matrix`` for maximum efficiency.

    y : Ignored
        Not used, present for API consistency by convention.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    self : object
        Fitted estimator.

    Raises
    ------
    ValueError
        If ``contamination`` is not ``"auto"`` and lies outside (0, 0.5],
        if ``max_samples`` is a string other than ``"auto"``, or if a
        non-integer ``max_samples`` lies outside (0, 1].
    """
    X = self._validate_data(X, accept_sparse=["csc"])
    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    n_samples = X.shape[0]

    # The caller's ``y`` is ignored; the trees are fitted against a target
    # drawn uniformly at random.
    rnd = check_random_state(self.random_state)
    y = rnd.uniform(size=n_samples)

    if self.contamination != "auto":
        if not (0.0 < self.contamination <= 0.5):
            raise ValueError(
                "contamination must be in (0, 0.5], got: %f" % self.contamination
            )

    # Resolve ``max_samples`` ("auto", int or float) into an absolute
    # number of samples per tree.
    if isinstance(self.max_samples, str):
        if self.max_samples != "auto":
            raise ValueError(
                "max_samples (%s) is not supported. "
                'Valid choices are: "auto", int or float' % self.max_samples
            )
        max_samples = min(256, n_samples)

    elif isinstance(self.max_samples, numbers.Integral):
        if self.max_samples > n_samples:
            warn(
                "max_samples (%s) is greater than the "
                "total number of samples (%s). max_samples "
                "will be set to n_samples for estimation."
                % (self.max_samples, n_samples)
            )
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not 0.0 < self.max_samples <= 1.0:
            raise ValueError(
                "max_samples must be in (0, 1], got %r" % self.max_samples
            )
        max_samples = int(self.max_samples * n_samples)

    self.max_samples_ = max_samples
    # Tree depth is capped at ceil(log2(max_samples)), with a floor of 1.
    max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
    super()._fit(
        X,
        y,
        max_samples,
        max_depth=max_depth,
        sample_weight=sample_weight,
        check_input=False,
    )

    if self.contamination == "auto":
        # 0.5 plays a special role as described in the original paper.
        # we take the opposite as we consider the opposite of their score.
        self.offset_ = -0.5
        return self

    # else, define offset_ wrt contamination parameter
    self.offset_ = np.percentile(self.score_samples(X), 100.0 * self.contamination)

    return self
|
||||
|
||||
def predict(self, X):
    """
    Predict if a particular sample is an outlier or not.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csr_matrix``.

    Returns
    -------
    is_inlier : ndarray of shape (n_samples,)
        For each observation, tells whether or not (+1 or -1) it should
        be considered as an inlier according to the fitted model.
    """
    check_is_fitted(self)
    shifted_scores = self.decision_function(X)
    # Start from "everything is an inlier" and flip the samples whose
    # shifted score falls strictly below zero.
    labels = np.full_like(shifted_scores, 1, dtype=int)
    labels[shifted_scores < 0] = -1
    return labels
|
||||
|
||||
def decision_function(self, X):
    """
    Average anomaly score of X of the base classifiers.

    The anomaly score of an input sample is computed as
    the mean anomaly score of the trees in the forest.

    The measure of normality of an observation given a tree is the depth
    of the leaf containing this observation, which is equivalent to
    the number of splittings required to isolate this point. In case of
    several observations n_left in the leaf, the average path length of
    a n_left samples isolation tree is added.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csr_matrix``.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        The anomaly score of the input samples.
        The lower, the more abnormal. Negative scores represent outliers,
        positive scores represent inliers.
    """
    raw_scores = self.score_samples(X)
    # Shifting by ``offset_`` makes 0 the threshold value for being an
    # outlier.
    return raw_scores - self.offset_
|
||||
|
||||
def score_samples(self, X):
    """
    Opposite of the anomaly score defined in the original paper.

    The anomaly score of an input sample is computed as
    the mean anomaly score of the trees in the forest.

    The measure of normality of an observation given a tree is the depth
    of the leaf containing this observation, which is equivalent to
    the number of splittings required to isolate this point. In case of
    several observations n_left in the leaf, the average path length of
    a n_left samples isolation tree is added.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        The anomaly score of the input samples.
        The lower, the more abnormal.
    """
    # code structure from ForestClassifier/predict_proba
    check_is_fitted(self)

    # Validate against the fitted feature layout without resetting it.
    X = self._validate_data(X, accept_sparse="csr", reset=False)

    # Negate so that bigger means better (here: less abnormal).
    paper_scores = self._compute_chunked_score_samples(X)
    return -paper_scores
|
||||
|
||||
def _compute_chunked_score_samples(self, X):
    """Score ``X`` in row chunks sized by the working-memory budget.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data matrix.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        One score per row of ``X``, as produced by
        ``_compute_score_samples``.
    """
    total_rows = _num_samples(X)

    # Column subsampling is only needed when the trees were not built on
    # every feature of X.
    subsample_features = not (self._max_features == X.shape[1])

    # We get as many rows as possible within our working_memory budget
    # (defined by sklearn.get_config()['working_memory']) to store
    # self._max_features in each row during computation.
    #
    # Note:
    #  - this will get at least 1 row, even if 1 row of score will
    #    exceed working_memory.
    #  - this does only account for temporary memory usage while loading
    #    the data needed to compute the scores -- the returned scores
    #    themselves are 1D.
    rows_per_chunk = get_chunk_n_rows(
        row_bytes=16 * self._max_features, max_n_rows=total_rows
    )

    scores = np.zeros(total_rows, order="f")
    for batch in gen_batches(total_rows, rows_per_chunk):
        # compute score on the slices of test samples:
        scores[batch] = self._compute_score_samples(X[batch], subsample_features)

    return scores
|
||||
|
||||
def _compute_score_samples(self, X, subsample_features):
    """
    Compute the score of each samples in X going through the extra trees.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data matrix.

    subsample_features : bool
        Whether features should be subsampled.

    Returns
    -------
    scores : ndarray of shape (n_samples,)
        ``2 ** -(accumulated depth / normalizer)`` for each row of ``X``.
    """
    path_lengths = np.zeros(X.shape[0], order="f")

    for tree, features in zip(self.estimators_, self.estimators_features_):
        if subsample_features:
            X_tree = X[:, features]
        else:
            X_tree = X

        leaf_ids = tree.apply(X_tree)
        visited_nodes = tree.decision_path(X_tree)
        leaf_sizes = tree.tree_.n_node_samples[leaf_ids]

        # Nodes on the path, plus the expected extra depth for leaves that
        # still hold several training samples, minus one.
        nodes_on_path = np.ravel(visited_nodes.sum(axis=1))
        path_lengths += nodes_on_path + _average_path_length(leaf_sizes) - 1.0

    normalizer = len(self.estimators_) * _average_path_length([self.max_samples_])

    # For a single training sample, normalizer and depth are 0.
    # Therefore, the exponent is left at its preset value of 1.
    exponent = np.divide(
        path_lengths,
        normalizer,
        out=np.ones_like(path_lengths),
        where=normalizer != 0,
    )
    return 2 ** (-exponent)
|
||||
|
||||
def _more_tags(self):
|
||||
return {
|
||||
"_xfail_checks": {
|
||||
"check_sample_weights_invariance": (
|
||||
"zero sample_weight is not equivalent to removing samples"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _average_path_length(n_samples_leaf):
    """
    The average path length in a n_samples iTree, which is equal to
    the average path length of an unsuccessful BST search since the
    latter has the same structure as an isolation tree.

    Parameters
    ----------
    n_samples_leaf : array-like of shape (n_samples,)
        The number of training samples in each test sample leaf, for
        each estimators.

    Returns
    -------
    average_path_length : ndarray of shape (n_samples,)
        Same shape as the validated input: 0 where the count is <= 1,
        1 where it equals 2, and the closed-form value otherwise.
    """
    n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)

    # Work on a single-row view and restore the caller's shape at the end.
    original_shape = n_samples_leaf.shape
    counts = n_samples_leaf.reshape((1, -1))
    lengths = np.zeros(counts.shape)

    at_most_one = counts <= 1
    exactly_two = counts == 2
    remaining = ~(at_most_one | exactly_two)

    # Entries with at most one sample keep the initial value of 0.
    lengths[exactly_two] = 1.0

    larger = counts[remaining]
    lengths[remaining] = (
        2.0 * (np.log(larger - 1.0) + np.euler_gamma)
        - 2.0 * (larger - 1.0) / larger
    )

    return lengths.reshape(original_shape)
|
||||
@@ -0,0 +1,891 @@
|
||||
"""Stacking classifier and regressor."""
|
||||
|
||||
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel
|
||||
import scipy.sparse as sparse
|
||||
|
||||
from ..base import clone
|
||||
from ..base import ClassifierMixin, RegressorMixin, TransformerMixin
|
||||
from ..base import is_classifier, is_regressor
|
||||
from ..exceptions import NotFittedError
|
||||
from ..utils._estimator_html_repr import _VisualBlock
|
||||
|
||||
from ._base import _fit_single_estimator
|
||||
from ._base import _BaseHeterogeneousEnsemble
|
||||
|
||||
from ..linear_model import LogisticRegression
|
||||
from ..linear_model import RidgeCV
|
||||
|
||||
from ..model_selection import cross_val_predict
|
||||
from ..model_selection import check_cv
|
||||
|
||||
from ..preprocessing import LabelEncoder
|
||||
|
||||
from ..utils import Bunch
|
||||
from ..utils.metaestimators import available_if
|
||||
from ..utils.multiclass import check_classification_targets
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import check_scalar
|
||||
from ..utils.validation import column_or_1d
|
||||
from ..utils.fixes import delayed
|
||||
from ..utils.validation import _check_feature_names_in
|
||||
|
||||
|
||||
def _estimator_has(attr):
|
||||
"""Check if we can delegate a method to the underlying estimator.
|
||||
|
||||
First, we check the first fitted final estimator if available, otherwise we
|
||||
check the unfitted final estimator.
|
||||
"""
|
||||
return lambda self: (
|
||||
hasattr(self.final_estimator_, attr)
|
||||
if hasattr(self, "final_estimator_")
|
||||
else hasattr(self.final_estimator, attr)
|
||||
)
|
||||
|
||||
|
||||
class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, metaclass=ABCMeta):
|
||||
"""Base class for stacking method."""
|
||||
|
||||
@abstractmethod
def __init__(
    self,
    estimators,
    final_estimator=None,
    *,
    cv=None,
    stack_method="auto",
    n_jobs=None,
    verbose=0,
    passthrough=False,
):
    """Record the constructor arguments on the instance, unchanged.

    `estimators` is handed to the parent constructor; every other
    argument is stored under an attribute of the same name.
    """
    super().__init__(estimators=estimators)
    self.final_estimator = final_estimator
    self.stack_method = stack_method
    self.cv = cv
    self.passthrough = passthrough
    self.n_jobs = n_jobs
    self.verbose = verbose
|
||||
|
||||
def _clone_final_estimator(self, default):
    """Set `final_estimator_` to a clone of the final estimator.

    Parameters
    ----------
    default : estimator
        Estimator cloned when `self.final_estimator` is None.
    """
    template = default if self.final_estimator is None else self.final_estimator
    self.final_estimator_ = clone(template)
|
||||
|
||||
def _concatenate_predictions(self, X, predictions):
|
||||
"""Concatenate the predictions of each first layer learner and
|
||||
possibly the input dataset `X`.
|
||||
|
||||
If `X` is sparse and `self.passthrough` is False, the output of
|
||||
`transform` will be dense (the predictions). If `X` is sparse
|
||||
and `self.passthrough` is True, the output of `transform` will
|
||||
be sparse.
|
||||
|
||||
This helper is in charge of ensuring the predictions are 2D arrays and
|
||||
it will drop one of the probability column when using probabilities
|
||||
in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1)
|
||||
"""
|
||||
X_meta = []
|
||||
for est_idx, preds in enumerate(predictions):
|
||||
# case where the estimator returned a 1D array
|
||||
if preds.ndim == 1:
|
||||
X_meta.append(preds.reshape(-1, 1))
|
||||
else:
|
||||
if (
|
||||
self.stack_method_[est_idx] == "predict_proba"
|
||||
and len(self.classes_) == 2
|
||||
):
|
||||
# Remove the first column when using probabilities in
|
||||
# binary classification because both features are perfectly
|
||||
# collinear.
|
||||
X_meta.append(preds[:, 1:])
|
||||
else:
|
||||
X_meta.append(preds)
|
||||
|
||||
self._n_feature_outs = [pred.shape[1] for pred in X_meta]
|
||||
if self.passthrough:
|
||||
X_meta.append(X)
|
||||
if sparse.issparse(X):
|
||||
return sparse.hstack(X_meta, format=X.format)
|
||||
|
||||
return np.hstack(X_meta)
|
||||
|
||||
@staticmethod
|
||||
def _method_name(name, estimator, method):
|
||||
if estimator == "drop":
|
||||
return None
|
||||
if method == "auto":
|
||||
if getattr(estimator, "predict_proba", None):
|
||||
return "predict_proba"
|
||||
elif getattr(estimator, "decision_function", None):
|
||||
return "decision_function"
|
||||
else:
|
||||
return "predict"
|
||||
else:
|
||||
if not hasattr(estimator, method):
|
||||
raise ValueError(
|
||||
"Underlying estimator {} does not implement the method {}.".format(
|
||||
name, method
|
||||
)
|
||||
)
|
||||
return method
|
||||
|
||||
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    The base estimators are fitted (or, with ``cv="prefit"``, only checked
    for being fitted), their predictions on `X` are collected, and the
    final estimator is fitted on the concatenation of those predictions.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

        .. versionchanged:: 0.23
           when not None, `sample_weight` is passed to all underlying
           estimators

    Returns
    -------
    self : object
    """
    # Check params.
    check_scalar(
        self.passthrough,
        name="passthrough",
        target_type=(np.bool_, bool),
        include_boundaries="neither",
    )
    # all_estimators contains all estimators, the one to be fitted and the
    # 'drop' string.
    names, all_estimators = self._validate_estimators()
    self._validate_final_estimator()

    # One requested stack method per entry of all_estimators (dropped
    # entries included, so positions stay aligned with `names`).
    stack_method = [self.stack_method] * len(all_estimators)

    if self.cv == "prefit":
        # Reuse the user's estimators as-is; only verify they are fitted.
        self.estimators_ = []
        for estimator in all_estimators:
            if estimator != "drop":
                check_is_fitted(estimator)
                self.estimators_.append(estimator)
    else:
        # Fit the base estimators on the whole training data. Those
        # base estimators will be used in transform, predict, and
        # predict_proba. They are exposed publicly.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators
            if est != "drop"
        )

    # Map every name to its fitted estimator, or to "drop". The index
    # only advances on non-dropped entries because `estimators_` holds no
    # placeholder for dropped ones.
    self.named_estimators_ = Bunch()
    est_fitted_idx = 0
    for name_est, org_est in zip(names, all_estimators):
        if org_est != "drop":
            current_estimator = self.estimators_[est_fitted_idx]
            self.named_estimators_[name_est] = current_estimator
            est_fitted_idx += 1
            if hasattr(current_estimator, "feature_names_in_"):
                self.feature_names_in_ = current_estimator.feature_names_in_
        else:
            self.named_estimators_[name_est] = "drop"

    # At this point the list still contains None for dropped estimators;
    # it is filtered further down.
    self.stack_method_ = [
        self._method_name(name, est, meth)
        for name, est, meth in zip(names, all_estimators, stack_method)
    ]

    if self.cv == "prefit":
        # Generate predictions from prefit models
        predictions = [
            getattr(estimator, predict_method)(X)
            for estimator, predict_method in zip(all_estimators, self.stack_method_)
            if estimator != "drop"
        ]
    else:
        # To train the meta-classifier using the most data as possible, we use
        # a cross-validation to obtain the output of the stacked estimators.
        # To ensure that the data provided to each estimator are the same,
        # we need to set the random state of the cv if there is one and we
        # need to take a copy.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        # NOTE(review): if check_cv returns the caller's own splitter
        # object, this assignment mutates it -- confirm that is intended.
        if hasattr(cv, "random_state") and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        fit_params = (
            {"sample_weight": sample_weight} if sample_weight is not None else None
        )
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est),
                X,
                y,
                cv=deepcopy(cv),
                method=meth,
                n_jobs=self.n_jobs,
                fit_params=fit_params,
                verbose=self.verbose,
            )
            for est, meth in zip(all_estimators, self.stack_method_)
            if est != "drop"
        )

    # Only not None or not 'drop' estimators will be used in transform.
    # Remove the None from the method as well.
    self.stack_method_ = [
        meth
        for (meth, est) in zip(self.stack_method_, all_estimators)
        if est != "drop"
    ]

    X_meta = self._concatenate_predictions(X, predictions)
    _fit_single_estimator(
        self.final_estimator_, X_meta, y, sample_weight=sample_weight
    )

    return self
|
||||
|
||||
@property
def n_features_in_(self):
    """Number of features seen during :term:`fit`."""
    try:
        check_is_fitted(self)
    except NotFittedError as nfe:
        # Surface "not fitted" as a missing attribute so that hasattr()
        # returns False on an unfitted estimator.
        message = f"{self.__class__.__name__} object has no attribute n_features_in_"
        raise AttributeError(message) from nfe
    else:
        first_estimator = self.estimators_[0]
        return first_estimator.n_features_in_
|
||||
|
||||
def _transform(self, X):
    """Concatenate and return the predictions of the estimators."""
    check_is_fitted(self)
    predictions = []
    for est, meth in zip(self.estimators_, self.stack_method_):
        if est == "drop":
            continue
        predict_fn = getattr(est, meth)
        predictions.append(predict_fn(X))
    return self._concatenate_predictions(X, predictions)
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features. The input feature names are only used when `passthrough` is
        `True`.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

        If `passthrough` is `False`, then only the names of `estimators` are used
        to generate the output feature names.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    input_features = _check_feature_names_in(
        self, input_features, generate_names=self.passthrough
    )

    prefix = self.__class__.__name__.lower()
    kept_names = [name for name, est in self.estimators if est != "drop"]

    meta_names = []
    for est_name, width in zip(kept_names, self._n_feature_outs):
        if width == 1:
            # A single output column carries no numeric suffix.
            meta_names.append(f"{prefix}_{est_name}")
            continue
        meta_names.extend(f"{prefix}_{est_name}{col}" for col in range(width))

    if not self.passthrough:
        return np.asarray(meta_names, dtype=object)

    return np.concatenate((meta_names, input_features))
|
||||
|
||||
@available_if(_estimator_has("predict"))
def predict(self, X, **predict_params):
    """Predict target for X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    **predict_params : dict of str -> obj
        Parameters to the `predict` called by the `final_estimator`. Note
        that this may be used to return uncertainties from some estimators
        with `return_std` or `return_cov`. Be aware that it will only
        account for uncertainty in the final estimator.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
        Predicted targets.
    """
    check_is_fitted(self)
    # The final estimator consumes the stacked outputs, not the raw X.
    X_meta = self.transform(X)
    return self.final_estimator_.predict(X_meta, **predict_params)
|
||||
|
||||
def _sk_visual_block_(self, final_estimator):
    """Build the visual block used by the HTML representation.

    The base estimators form one parallel block, followed in series by a
    block holding `final_estimator`.
    """
    names, estimators = zip(*self.estimators)
    base_layer = _VisualBlock(
        "parallel", estimators, names=names, dash_wrapped=False
    )

    # final estimator is wrapped in a parallel block to show the label:
    # 'final_estimator' in the html repr
    final_layer = _VisualBlock(
        "parallel",
        [final_estimator],
        names=["final_estimator"],
        dash_wrapped=False,
    )

    layers = (base_layer, final_layer)
    return _VisualBlock("serial", layers, dash_wrapped=False)
|
||||
|
||||
|
||||
class StackingClassifier(ClassifierMixin, _BaseStacking):
|
||||
"""Stack of estimators with a final classifier.
|
||||
|
||||
Stacked generalization consists in stacking the output of individual
|
||||
estimator and use a classifier to compute the final prediction. Stacking
|
||||
allows to use the strength of each individual estimator by using their
|
||||
output as input of a final estimator.
|
||||
|
||||
Note that `estimators_` are fitted on the full `X` while `final_estimator_`
|
||||
is trained using cross-validated predictions of the base estimators using
|
||||
`cross_val_predict`.
|
||||
|
||||
Read more in the :ref:`User Guide <stacking>`.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimators : list of (str, estimator)
|
||||
Base estimators which will be stacked together. Each element of the
|
||||
list is defined as a tuple of string (i.e. name) and an estimator
|
||||
instance. An estimator can be set to 'drop' using `set_params`.
|
||||
|
||||
final_estimator : estimator, default=None
|
||||
A classifier which will be used to combine the base estimators.
|
||||
The default classifier is a
|
||||
:class:`~sklearn.linear_model.LogisticRegression`.
|
||||
|
||||
cv : int, cross-validation generator, iterable, or "prefit", default=None
|
||||
Determines the cross-validation splitting strategy used in
|
||||
`cross_val_predict` to train `final_estimator`. Possible inputs for
|
||||
cv are:
|
||||
|
||||
* None, to use the default 5-fold cross validation,
|
||||
* integer, to specify the number of folds in a (Stratified) KFold,
|
||||
* An object to be used as a cross-validation generator,
|
||||
* An iterable yielding train, test splits,
|
||||
* `"prefit"` to assume the `estimators` are prefit. In this case, the
|
||||
estimators will not be refitted.
|
||||
|
||||
For integer/None inputs, if the estimator is a classifier and y is
|
||||
either binary or multiclass,
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used.
|
||||
In all other cases, :class:`~sklearn.model_selection.KFold` is used.
|
||||
These splitters are instantiated with `shuffle=False` so the splits
|
||||
will be the same across calls.
|
||||
|
||||
Refer :ref:`User Guide <cross_validation>` for the various
|
||||
cross-validation strategies that can be used here.
|
||||
|
||||
If "prefit" is passed, it is assumed that all `estimators` have
|
||||
been fitted already. The `final_estimator_` is trained on the `estimators`
|
||||
predictions on the full training set and are **not** cross validated
|
||||
predictions. Please note that if the models have been trained on the same
|
||||
data to train the stacking model, there is a very high risk of overfitting.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
The 'prefit' option was added in 1.1
|
||||
|
||||
.. note::
|
||||
A larger number of splits will provide no benefits if the number
|
||||
of training samples is large enough. Indeed, the training time
|
||||
will increase. ``cv`` is not used for model evaluation but for
|
||||
prediction.
|
||||
|
||||
stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \
|
||||
default='auto'
|
||||
Methods called for each base estimator. It can be:
|
||||
|
||||
* if 'auto', it will try to invoke, for each estimator,
|
||||
`'predict_proba'`, `'decision_function'` or `'predict'` in that
|
||||
order.
|
||||
* otherwise, one of `'predict_proba'`, `'decision_function'` or
|
||||
`'predict'`. If the method is not implemented by the estimator, it
|
||||
will raise an error.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to run in parallel all `estimators` `fit`.
|
||||
`None` means 1 unless in a `joblib.parallel_backend` context. -1 means
|
||||
using all processors. See Glossary for more details.
|
||||
|
||||
passthrough : bool, default=False
|
||||
When False, only the predictions of estimators will be used as
|
||||
training data for `final_estimator`. When True, the
|
||||
`final_estimator` is trained on the predictions as well as the
|
||||
original training data.
|
||||
|
||||
verbose : int, default=0
|
||||
Verbosity level.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : ndarray of shape (n_classes,)
|
||||
Class labels.
|
||||
|
||||
estimators_ : list of estimators
|
||||
The elements of the `estimators` parameter, having been fitted on the
|
||||
training data. If an estimator has been set to `'drop'`, it
|
||||
will not appear in `estimators_`. When `cv="prefit"`, `estimators_`
|
||||
is set to `estimators` and is not fitted again.
|
||||
|
||||
named_estimators_ : :class:`~sklearn.utils.Bunch`
|
||||
Attribute to access any fitted sub-estimators by name.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`. Only defined if the
|
||||
underlying classifier exposes such an attribute when fit.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Only defined if the
|
||||
underlying estimators expose such an attribute when fit.
|
||||
.. versionadded:: 1.0
|
||||
|
||||
final_estimator_ : estimator
|
||||
The classifier which predicts given the output of `estimators_`.
|
||||
|
||||
stack_method_ : list of str
|
||||
The method used by each base estimator.
|
||||
|
||||
See Also
|
||||
--------
|
||||
StackingRegressor : Stack of estimators with a final regressor.
|
||||
|
||||
Notes
|
||||
-----
|
||||
When `predict_proba` is used by each estimator (i.e. most of the time for
|
||||
`stack_method='auto'` or specifically for `stack_method='predict_proba'`),
|
||||
The first column predicted by each estimator will be dropped in the case
|
||||
of a binary classification problem. Indeed, both features will be perfectly
|
||||
collinear.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
|
||||
(1992): 241-259.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import load_iris
|
||||
>>> from sklearn.ensemble import RandomForestClassifier
|
||||
>>> from sklearn.svm import LinearSVC
|
||||
>>> from sklearn.linear_model import LogisticRegression
|
||||
>>> from sklearn.preprocessing import StandardScaler
|
||||
>>> from sklearn.pipeline import make_pipeline
|
||||
>>> from sklearn.ensemble import StackingClassifier
|
||||
>>> X, y = load_iris(return_X_y=True)
|
||||
>>> estimators = [
|
||||
... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
|
||||
... ('svr', make_pipeline(StandardScaler(),
|
||||
... LinearSVC(random_state=42)))
|
||||
... ]
|
||||
>>> clf = StackingClassifier(
|
||||
... estimators=estimators, final_estimator=LogisticRegression()
|
||||
... )
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(
|
||||
... X, y, stratify=y, random_state=42
|
||||
... )
|
||||
>>> clf.fit(X_train, y_train).score(X_test, y_test)
|
||||
0.9...
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    estimators,
    final_estimator=None,
    *,
    cv=None,
    stack_method="auto",
    n_jobs=None,
    passthrough=False,
    verbose=0,
):
    """Forward every argument, unchanged, to the parent constructor."""
    super().__init__(
        estimators=estimators,
        final_estimator=final_estimator,
        stack_method=stack_method,
        cv=cv,
        passthrough=passthrough,
        n_jobs=n_jobs,
        verbose=verbose,
    )
|
||||
|
||||
def _validate_final_estimator(self):
    """Clone the final estimator and require it to be a classifier.

    A `LogisticRegression` is used when `final_estimator` is None.

    Raises
    ------
    ValueError
        If the cloned final estimator is not a classifier.
    """
    self._clone_final_estimator(default=LogisticRegression())
    if is_classifier(self.final_estimator_):
        return
    raise ValueError(
        "'final_estimator' parameter should be a classifier. Got {}".format(
            self.final_estimator_
        )
    )
|
||||
|
||||
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

    Returns
    -------
    self : object
        Returns a fitted instance of estimator.
    """
    check_classification_targets(y)

    # The parent fit receives encoded labels; the encoder is kept on the
    # instance so predictions can be mapped back to the original labels.
    encoder = LabelEncoder().fit(y)
    self._le = encoder
    self.classes_ = encoder.classes_

    y_encoded = encoder.transform(y)
    return super().fit(X, y_encoded, sample_weight)
|
||||
|
||||
@available_if(_estimator_has("predict"))
def predict(self, X, **predict_params):
    """Predict target for X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    **predict_params : dict of str -> obj
        Parameters to the `predict` called by the `final_estimator`. Note
        that this may be used to return uncertainties from some estimators
        with `return_std` or `return_cov`. Be aware that it will only
        account for uncertainty in the final estimator.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)
        Predicted targets.
    """
    # The parent predicts encoded labels; decode them with the encoder
    # stored on the instance.
    return self._le.inverse_transform(super().predict(X, **predict_params))
|
||||
|
||||
@available_if(_estimator_has("predict_proba"))
def predict_proba(self, X):
    """Predict class probabilities for `X` using the final estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    Returns
    -------
    probabilities : ndarray of shape (n_samples, n_classes) or \
        list of ndarray of shape (n_output,)
        The class probabilities of the input samples.
    """
    check_is_fitted(self)
    final_predict_proba = self.final_estimator_.predict_proba
    # The final estimator consumes the output of `transform`.
    return final_predict_proba(self.transform(X))
|
||||
|
||||
@available_if(_estimator_has("decision_function"))
def decision_function(self, X):
    """Decision function for samples in `X` using the final estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    Returns
    -------
    decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \
        or (n_samples, n_classes * (n_classes-1) / 2)
        The decision function computed by the final estimator.
    """
    check_is_fitted(self)
    final_decision_function = self.final_estimator_.decision_function
    # The final estimator consumes the output of `transform`.
    return final_decision_function(self.transform(X))
|
||||
|
||||
def transform(self, X):
    """Return class labels or probabilities for X for each estimator.

    This is a public wrapper that delegates to ``_transform``.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    Returns
    -------
    y_preds : ndarray of shape (n_samples, n_estimators) or \
            (n_samples, n_classes * n_estimators)
        Prediction outputs for each estimator.
    """
    stacked_outputs = self._transform(X)
    return stacked_outputs
|
||||
|
||||
def _sk_visual_block_(self):
    # The fallback below mirrors the default final estimator; if that
    # default ever changes, this must be updated as well.
    final_estimator = (
        LogisticRegression() if self.final_estimator is None else self.final_estimator
    )
    return super()._sk_visual_block_(final_estimator)
|
||||
|
||||
|
||||
class StackingRegressor(RegressorMixin, _BaseStacking):
    """Stack of estimators with a final regressor.

    Stacked generalization consists in stacking the output of individual
    estimators and using a regressor to compute the final prediction.
    Stacking allows to use the strength of each individual estimator by using
    their output as input of a final estimator.

    Note that `estimators_` are fitted on the full `X` while `final_estimator_`
    is trained using cross-validated predictions of the base estimators using
    `cross_val_predict`.

    Read more in the :ref:`User Guide <stacking>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    estimators : list of (str, estimator)
        Base estimators which will be stacked together. Each element of the
        list is defined as a tuple of string (i.e. name) and an estimator
        instance. An estimator can be set to 'drop' using `set_params`.

    final_estimator : estimator, default=None
        A regressor which will be used to combine the base estimators.
        The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.

    cv : int, cross-validation generator, iterable, or "prefit", default=None
        Determines the cross-validation splitting strategy used in
        `cross_val_predict` to train `final_estimator`. Possible inputs for
        cv are:

        * None, to use the default 5-fold cross validation,
        * integer, to specify the number of folds in a (Stratified) KFold,
        * An object to be used as a cross-validation generator,
        * An iterable yielding train, test splits.
        * "prefit" to assume the `estimators` are prefit, and skip cross validation

        For integer/None inputs, if the estimator is a classifier and y is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used.
        In all other cases, :class:`~sklearn.model_selection.KFold` is used.
        These splitters are instantiated with `shuffle=False` so the splits
        will be the same across calls.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        If "prefit" is passed, it is assumed that all `estimators` have
        been fitted already. The `final_estimator_` is trained on the `estimators`
        predictions on the full training set and are **not** cross validated
        predictions. Please note that if the models have been trained on the same
        data to train the stacking model, there is a very high risk of overfitting.

        .. versionadded:: 1.1
            The 'prefit' option was added in 1.1

        .. note::
           A larger number of split will provide no benefits if the number
           of training samples is large enough. Indeed, the training time
           will increase. ``cv`` is not used for model evaluation but for
           prediction.

    n_jobs : int, default=None
        The number of jobs to run in parallel for `fit` of all `estimators`.
        `None` means 1 unless in a `joblib.parallel_backend` context. -1 means
        using all processors. See Glossary for more details.

    passthrough : bool, default=False
        When False, only the predictions of estimators will be used as
        training data for `final_estimator`. When True, the
        `final_estimator` is trained on the predictions as well as the
        original training data.

    verbose : int, default=0
        Verbosity level.

    Attributes
    ----------
    estimators_ : list of estimator
        The elements of the `estimators` parameter, having been fitted on the
        training data. If an estimator has been set to `'drop'`, it
        will not appear in `estimators_`. When `cv="prefit"`, `estimators_`
        is set to `estimators` and is not fitted again.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying regressor exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    final_estimator_ : estimator
        The fitted regressor that stacks the base estimators.

    stack_method_ : list of str
        The method used by each base estimator.

    See Also
    --------
    StackingClassifier : Stack of estimators with a final classifier.

    References
    ----------
    .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2
       (1992): 241-259.

    Examples
    --------
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.linear_model import RidgeCV
    >>> from sklearn.svm import LinearSVR
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.ensemble import StackingRegressor
    >>> X, y = load_diabetes(return_X_y=True)
    >>> estimators = [
    ...     ('lr', RidgeCV()),
    ...     ('svr', LinearSVR(random_state=42))
    ... ]
    >>> reg = StackingRegressor(
    ...     estimators=estimators,
    ...     final_estimator=RandomForestRegressor(n_estimators=10,
    ...                                           random_state=42)
    ... )
    >>> from sklearn.model_selection import train_test_split
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, random_state=42
    ... )
    >>> reg.fit(X_train, y_train).score(X_test, y_test)
    0.3...
    """

    def __init__(
        self,
        estimators,
        final_estimator=None,
        *,
        cv=None,
        n_jobs=None,
        passthrough=False,
        verbose=0,
    ):
        # A regressor always stacks on `predict`, so `stack_method` is fixed
        # here instead of being exposed as a constructor parameter.
        parent_kwargs = dict(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=cv,
            stack_method="predict",
            n_jobs=n_jobs,
            passthrough=passthrough,
            verbose=verbose,
        )
        super().__init__(**parent_kwargs)

    def _validate_final_estimator(self):
        """Prepare the final estimator and check that it is a regressor.

        ``_clone_final_estimator`` is called with ``RidgeCV()`` as the
        default; ``final_estimator_`` is then checked with ``is_regressor``.

        Raises
        ------
        ValueError
            If ``final_estimator_`` is not a regressor.
        """
        self._clone_final_estimator(default=RidgeCV())
        if is_regressor(self.final_estimator_):
            return
        raise ValueError(
            "'final_estimator' parameter should be a regressor. Got {}".format(
                self.final_estimator_
            )
        )

    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
            Returns a fitted instance.
        """
        # `column_or_1d` is asked to warn (warn=True) rather than stay silent.
        flat_y = column_or_1d(y, warn=True)
        return super().fit(X, flat_y, sample_weight)

    def transform(self, X):
        """Return the predictions for X for each estimator.

        This is a public wrapper that delegates to ``_transform``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        y_preds : ndarray of shape (n_samples, n_estimators)
            Prediction outputs for each estimator.
        """
        stacked_outputs = self._transform(X)
        return stacked_outputs

    def _sk_visual_block_(self):
        # The fallback below mirrors the default final estimator; if that
        # default ever changes, this must be updated as well.
        final_estimator = (
            RidgeCV() if self.final_estimator is None else self.final_estimator
        )
        return super()._sk_visual_block_(final_estimator)
|
||||
@@ -0,0 +1,656 @@
|
||||
"""
|
||||
Soft Voting/Majority Rule classifier and Voting regressor.
|
||||
|
||||
This module contains:
|
||||
- A Soft Voting/Majority Rule classifier for classification estimators.
|
||||
- A Voting regressor for regression estimators.
|
||||
"""
|
||||
|
||||
# Authors: Sebastian Raschka <se.raschka@gmail.com>,
|
||||
# Gilles Louppe <g.louppe@gmail.com>,
|
||||
# Ramil Nugmanov <stsouko@live.ru>
|
||||
# Mohamed Ali Jamaoui <m.ali.jamaoui@gmail.com>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from abc import abstractmethod
|
||||
|
||||
import numbers
|
||||
import numpy as np
|
||||
|
||||
from joblib import Parallel
|
||||
|
||||
from ..base import ClassifierMixin
|
||||
from ..base import RegressorMixin
|
||||
from ..base import TransformerMixin
|
||||
from ..base import clone
|
||||
from ._base import _fit_single_estimator
|
||||
from ._base import _BaseHeterogeneousEnsemble
|
||||
from ..preprocessing import LabelEncoder
|
||||
from ..utils import Bunch
|
||||
from ..utils import check_scalar
|
||||
from ..utils.metaestimators import available_if
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import _check_feature_names_in
|
||||
from ..utils.multiclass import check_classification_targets
|
||||
from ..utils.validation import column_or_1d
|
||||
from ..exceptions import NotFittedError
|
||||
from ..utils._estimator_html_repr import _VisualBlock
|
||||
from ..utils.fixes import delayed
|
||||
|
||||
|
||||
class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):
    """Base class for voting.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    def _log_message(self, name, idx, total):
        """Return the progress message for one estimator, or None if quiet."""
        if self.verbose:
            return f"({idx} of {total}) Processing {name}"
        return None

    @property
    def _weights_not_none(self):
        """Get the weights of the estimators that are not ``'drop'``."""
        if self.weights is None:
            return None
        return [
            weight
            for entry, weight in zip(self.estimators, self.weights)
            if entry[1] != "drop"
        ]

    def _predict(self, X):
        """Collect the ``predict`` output of every fitted estimator.

        The per-estimator results are stacked and transposed, so each
        estimator contributes the last axis of the returned array.
        """
        per_estimator = [est.predict(X) for est in self.estimators_]
        return np.asarray(per_estimator).T

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Get common fit operations."""
        est_names, candidates = self._validate_estimators()

        check_scalar(
            self.verbose,
            name="verbose",
            target_type=(numbers.Integral, np.bool_),
            min_val=0,
        )

        # One weight is required per declared estimator, dropped ones included.
        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError(
                "Number of `estimators` and weights must be equal; got"
                f" {len(self.weights)} weights, {len(self.estimators)} estimators"
            )

        # Clones are fitted, so the estimators passed by the user stay
        # untouched; entries set to 'drop' are skipped.
        parallel = Parallel(n_jobs=self.n_jobs)
        self.estimators_ = parallel(
            delayed(_fit_single_estimator)(
                clone(candidate),
                X,
                y,
                sample_weight=sample_weight,
                message_clsname="Voting",
                message=self._log_message(
                    est_names[position], position + 1, len(candidates)
                ),
            )
            for position, candidate in enumerate(candidates)
            if candidate != "drop"
        )

        self.named_estimators_ = Bunch()

        # Uses 'drop' as placeholder for dropped estimators
        fitted_iter = iter(self.estimators_)
        for est_name, declared in self.estimators:
            if declared == "drop":
                resolved = declared
            else:
                resolved = next(fitted_iter)
            self.named_estimators_[est_name] = resolved

            if hasattr(resolved, "feature_names_in_"):
                self.feature_names_in_ = resolved.feature_names_in_

        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Return class labels or probabilities for each estimator.

        Return predictions for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            Input samples.

        y : ndarray of shape (n_samples,), default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray array of shape (n_samples, n_features_new)
            Transformed array.
        """
        transformed = super().fit_transform(X, y, **fit_params)
        return transformed

    @property
    def n_features_in_(self):
        """Number of features seen during :term:`fit`."""
        # For consistency with other estimators we raise an AttributeError so
        # that hasattr() fails if the estimator isn't fitted.
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            message = "{} object has no n_features_in_ attribute.".format(
                self.__class__.__name__
            )
            raise AttributeError(message) from nfe

        first_estimator = self.estimators_[0]
        return first_estimator.n_features_in_

    def _sk_visual_block_(self):
        est_names, est_objects = zip(*self.estimators)
        return _VisualBlock("parallel", est_objects, names=est_names)

    def _more_tags(self):
        tags = {"preserves_dtype": []}
        return tags
|
||||
|
||||
|
||||
class VotingClassifier(ClassifierMixin, _BaseVoting):
    """Soft Voting/Majority Rule classifier for unfitted estimators.

    Read more in the :ref:`User Guide <voting_classifier>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to ``'drop'`` using
        :meth:`set_params`.

        .. versionchanged:: 0.21
            ``'drop'`` is accepted. Using None was deprecated in 0.22 and
            support was removed in 0.24.

    voting : {'hard', 'soft'}, default='hard'
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    weights : array-like of shape (n_classifiers,), default=None
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted class labels (`hard` voting) or class probabilities
        before averaging (`soft` voting). Uses uniform weights if `None`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for ``fit``.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.18

    flatten_transform : bool, default=True
        Affects shape of transform output only when voting='soft'
        If voting='soft' and flatten_transform=True, transform method returns
        matrix with shape (n_samples, n_classifiers * n_classes). If
        flatten_transform=False, it returns
        (n_classifiers, n_samples, n_classes).

    verbose : bool, default=False
        If True, the time elapsed while fitting will be printed as it
        is completed.

        .. versionadded:: 0.23

    Attributes
    ----------
    estimators_ : list of classifiers
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not 'drop'.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

        .. versionadded:: 0.20

    le_ : :class:`~sklearn.preprocessing.LabelEncoder`
        Transformer used to encode the labels during fit and decode during
        prediction.

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying classifier exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    VotingRegressor : Prediction voting regressor.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    >>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    >>> clf3 = GaussianNB()
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eclf1 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
    >>> eclf1 = eclf1.fit(X, y)
    >>> print(eclf1.predict(X))
    [1 1 1 2 2 2]
    >>> np.array_equal(eclf1.named_estimators_.lr.predict(X),
    ...                eclf1.named_estimators_['lr'].predict(X))
    True
    >>> eclf2 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...         voting='soft')
    >>> eclf2 = eclf2.fit(X, y)
    >>> print(eclf2.predict(X))
    [1 1 1 2 2 2]

    To drop an estimator, :meth:`set_params` can be used to remove it. Here we
    dropped one of the estimators, resulting in 2 fitted estimators:

    >>> eclf2 = eclf2.set_params(lr='drop')
    >>> eclf2 = eclf2.fit(X, y)
    >>> len(eclf2.estimators_)
    2

    Setting `flatten_transform=True` with `voting='soft'` flattens output shape of
    `transform`:

    >>> eclf3 = VotingClassifier(estimators=[
    ...        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...        voting='soft', weights=[2,1,1],
    ...        flatten_transform=True)
    >>> eclf3 = eclf3.fit(X, y)
    >>> print(eclf3.predict(X))
    [1 1 1 2 2 2]
    >>> print(eclf3.transform(X).shape)
    (6, 6)
    """

    def __init__(
        self,
        estimators,
        *,
        voting="hard",
        weights=None,
        n_jobs=None,
        flatten_transform=True,
        verbose=False,
    ):
        super().__init__(estimators=estimators)
        self.voting = voting
        self.weights = weights
        self.n_jobs = n_jobs
        self.flatten_transform = flatten_transform
        self.verbose = verbose

    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

            .. versionadded:: 0.18

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        NotImplementedError
            If `y` is a NumPy array with more than one column.
        ValueError
            If `voting` is neither ``'soft'`` nor ``'hard'``.
        """
        check_classification_targets(y)
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError(
                "Multilabel and multi-output classification is not supported."
            )

        check_scalar(
            self.flatten_transform,
            name="flatten_transform",
            target_type=(numbers.Integral, np.bool_),
        )

        if self.voting not in ("soft", "hard"):
            raise ValueError(
                f"Voting must be 'soft' or 'hard'; got (voting={self.voting!r})"
            )

        # The sub-estimators are trained on encoded labels; `predict` decodes
        # them again with the same encoder.
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        transformed_y = self.le_.transform(y)

        return super().fit(X, transformed_y, sample_weight)

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        maj : array-like of shape (n_samples,)
            Predicted class labels.
        """
        check_is_fitted(self)
        if self.voting == "soft":
            maj = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'hard' voting
            predictions = self._predict(X)
            # The weights do not depend on the sample, so look them up once
            # instead of rebuilding the list for every row.
            weights = self._weights_not_none
            maj = np.apply_along_axis(
                lambda row: np.argmax(np.bincount(row, weights=weights)),
                axis=1,
                arr=predictions,
            )

        # Map the encoded winners back to the original class labels.
        return self.le_.inverse_transform(maj)

    def _collect_probas(self, X):
        """Collect results from clf.predict_proba calls."""
        return np.asarray([clf.predict_proba(X) for clf in self.estimators_])

    def _check_voting(self):
        # Used as the `available_if` condition for `predict_proba`: raising
        # AttributeError here explains why the method is unavailable.
        if self.voting == "hard":
            raise AttributeError(
                f"predict_proba is not available when voting={repr(self.voting)}"
            )
        return True

    @available_if(_check_voting)
    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        avg : array-like of shape (n_samples, n_classes)
            Weighted average probability for each class per sample.
        """
        check_is_fitted(self)
        avg = np.average(
            self._collect_probas(X), axis=0, weights=self._weights_not_none
        )
        return avg

    def transform(self, X):
        """Return class labels or probabilities for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        probabilities_or_labels
            If `voting='soft'` and `flatten_transform=True`:
                returns ndarray of shape (n_samples, n_classifiers * n_classes),
                being class probabilities calculated by each classifier.
            If `voting='soft'` and `flatten_transform=False`:
                ndarray of shape (n_classifiers, n_samples, n_classes)
            If `voting='hard'`:
                ndarray of shape (n_samples, n_classifiers), being
                class labels predicted by each classifier.
        """
        check_is_fitted(self)

        if self.voting != "soft":
            return self._predict(X)

        probas = self._collect_probas(X)
        if not self.flatten_transform:
            return probas
        # Put the per-classifier probability blocks side by side.
        return np.hstack(probas)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.

        Raises
        ------
        ValueError
            If `voting='soft'` and `flatten_transform=False`.
        """
        if self.voting == "soft" and not self.flatten_transform:
            raise ValueError(
                "get_feature_names_out is not supported when `voting='soft'` and "
                "`flatten_transform=False`"
            )

        _check_feature_names_in(self, input_features, generate_names=False)
        class_name = self.__class__.__name__.lower()

        # Dropped estimators produce no output column.
        active_names = [name for name, est in self.estimators if est != "drop"]

        if self.voting == "hard":
            return np.asarray(
                [f"{class_name}_{name}" for name in active_names], dtype=object
            )

        # voting == "soft": one name per (estimator, class index) pair.
        n_classes = len(self.classes_)
        names_out = [
            f"{class_name}_{name}{i}" for name in active_names for i in range(n_classes)
        ]
        return np.asarray(names_out, dtype=object)
|
||||
|
||||
|
||||
class VotingRegressor(RegressorMixin, _BaseVoting):
|
||||
"""Prediction voting regressor for unfitted estimators.
|
||||
|
||||
A voting regressor is an ensemble meta-estimator that fits several base
|
||||
regressors, each on the whole dataset. Then it averages the individual
|
||||
predictions to form a final prediction.
|
||||
|
||||
Read more in the :ref:`User Guide <voting_regressor>`.
|
||||
|
||||
.. versionadded:: 0.21
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimators : list of (str, estimator) tuples
|
||||
Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
|
||||
of those original estimators that will be stored in the class attribute
|
||||
``self.estimators_``. An estimator can be set to ``'drop'`` using
|
||||
:meth:`set_params`.
|
||||
|
||||
.. versionchanged:: 0.21
|
||||
``'drop'`` is accepted. Using None was deprecated in 0.22 and
|
||||
support was removed in 0.24.
|
||||
|
||||
weights : array-like of shape (n_regressors,), default=None
|
||||
Sequence of weights (`float` or `int`) to weight the occurrences of
|
||||
predicted values before averaging. Uses uniform weights if `None`.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to run in parallel for ``fit``.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
verbose : bool, default=False
|
||||
If True, the time elapsed while fitting will be printed as it
|
||||
is completed.
|
||||
|
||||
.. versionadded:: 0.23
|
||||
|
||||
Attributes
|
||||
----------
|
||||
estimators_ : list of regressors
|
||||
The collection of fitted sub-estimators as defined in ``estimators``
|
||||
that are not 'drop'.
|
||||
|
||||
named_estimators_ : :class:`~sklearn.utils.Bunch`
|
||||
Attribute to access any fitted sub-estimators by name.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`. Only defined if the
|
||||
underlying regressor exposes such an attribute when fit.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Only defined if the
|
||||
underlying estimators expose such an attribute when fit.
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
VotingClassifier : Soft Voting/Majority Rule classifier.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.linear_model import LinearRegression
|
||||
>>> from sklearn.ensemble import RandomForestRegressor
|
||||
>>> from sklearn.ensemble import VotingRegressor
|
||||
>>> from sklearn.neighbors import KNeighborsRegressor
|
||||
>>> r1 = LinearRegression()
|
||||
>>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
|
||||
>>> r3 = KNeighborsRegressor()
|
||||
>>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
|
||||
>>> y = np.array([2, 6, 12, 20, 30, 42])
|
||||
>>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
|
||||
>>> print(er.fit(X, y).predict(X))
|
||||
[ 6.8... 8.4... 12.5... 17.8... 26... 34...]
|
||||
|
||||
In the following example, we drop the `'lr'` estimator with
|
||||
:meth:`~VotingRegressor.set_params` and fit the remaining two estimators:
|
||||
|
||||
>>> er = er.set_params(lr='drop')
|
||||
>>> er = er.fit(X, y)
|
||||
>>> len(er.estimators_)
|
||||
2
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    estimators,
    *,
    weights=None,
    n_jobs=None,
    verbose=False,
):
    # The estimators go to the parent constructor; the remaining
    # hyper-parameters are stored as given, without validation.
    super().__init__(estimators=estimators)
    self.weights = weights
    self.n_jobs = n_jobs
    self.verbose = verbose
|
||||
|
||||
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # Validate the target with ``column_or_1d`` (warnings enabled) before
    # delegating the actual fitting to the parent class.
    validated_y = column_or_1d(y, warn=True)
    fitted = super().fit(X, validated_y, sample_weight)
    return fitted
|
||||
|
||||
def predict(self, X):
    """Predict regression target for X.

    The predicted regression target of an input sample is obtained by
    averaging the regression targets predicted by the estimators in the
    ensemble.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    y : ndarray of shape (n_samples,)
        The predicted values.
    """
    check_is_fitted(self)
    # Average across axis 1 of the stacked predictions, using
    # ``self._weights_not_none`` as the averaging weights.
    stacked_predictions = self._predict(X)
    return np.average(stacked_predictions, axis=1, weights=self._weights_not_none)
|
||||
|
||||
def transform(self, X):
    """Return predictions for X for each estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    predictions : ndarray of shape (n_samples, n_regressors)
        Values predicted by each regressor.
    """
    check_is_fitted(self)
    # Unlike ``predict`` no averaging is applied: the per-estimator
    # predictions are returned as they come out of ``_predict``.
    per_estimator_predictions = self._predict(X)
    return per_estimator_predictions
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Not used, present here for API consistency by convention.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    # Called for validation only; its return value is not used here.
    _check_feature_names_in(self, input_features, generate_names=False)

    prefix = self.__class__.__name__.lower()
    out_names = []
    for name, est in self.estimators:
        # Estimators set to "drop" contribute no output column.
        if est != "drop":
            out_names.append(f"{prefix}_{name}")
    return np.asarray(out_names, dtype=object)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,73 @@
|
||||
import numpy
|
||||
from numpy.distutils.misc_util import Configuration
|
||||
|
||||
|
||||
def configuration(parent_package="", top_path=None):
    """Build the numpy.distutils configuration of the ``ensemble`` package.

    Registers the extension modules and the test sub-packages, in the same
    order as they are listed below, and returns the resulting
    ``Configuration`` object.
    """
    config = Configuration("ensemble", parent_package, top_path)

    def register_extension(module_name, pyx_source):
        # Every extension is built from a single .pyx file and needs the
        # NumPy headers on its include path.
        config.add_extension(
            module_name,
            sources=[pyx_source],
            include_dirs=[numpy.get_include()],
        )

    register_extension("_gradient_boosting", "_gradient_boosting.pyx")

    config.add_subpackage("tests")

    # Histogram-based gradient boosting files
    hist_gradient_boosting_extensions = (
        (
            "_hist_gradient_boosting._gradient_boosting",
            "_hist_gradient_boosting/_gradient_boosting.pyx",
        ),
        (
            "_hist_gradient_boosting.histogram",
            "_hist_gradient_boosting/histogram.pyx",
        ),
        (
            "_hist_gradient_boosting.splitting",
            "_hist_gradient_boosting/splitting.pyx",
        ),
        (
            "_hist_gradient_boosting._binning",
            "_hist_gradient_boosting/_binning.pyx",
        ),
        (
            "_hist_gradient_boosting._predictor",
            "_hist_gradient_boosting/_predictor.pyx",
        ),
        (
            "_hist_gradient_boosting._bitset",
            "_hist_gradient_boosting/_bitset.pyx",
        ),
        (
            "_hist_gradient_boosting.common",
            "_hist_gradient_boosting/common.pyx",
        ),
        (
            "_hist_gradient_boosting.utils",
            "_hist_gradient_boosting/utils.pyx",
        ),
    )
    for module_name, pyx_source in hist_gradient_boosting_extensions:
        register_extension(module_name, pyx_source)

    config.add_subpackage("_hist_gradient_boosting.tests")

    return config
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # ``setup`` is only needed when this file is executed as a script.
    from numpy.distutils.core import setup

    ensemble_config = configuration()
    setup(**ensemble_config.todict())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,963 @@
|
||||
"""
|
||||
Testing for the bagging ensemble module (sklearn.ensemble.bagging).
|
||||
"""
|
||||
|
||||
# Author: Gilles Louppe
|
||||
# License: BSD 3 clause
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import joblib
|
||||
import pytest
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.model_selection import GridSearchCV, ParameterGrid
|
||||
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
|
||||
from sklearn.linear_model import Perceptron, LogisticRegression
|
||||
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.random_projection import SparseRandomProjection
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.feature_selection import SelectKBest
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.preprocessing import FunctionTransformer, scale
|
||||
from itertools import cycle
|
||||
|
||||
from scipy.sparse import csc_matrix, csr_matrix
|
||||
|
||||
# Module-level generator used for the dataset shuffles below.
rng = check_random_state(0)

# Load the iris dataset and shuffle samples and labels with one common
# permutation.
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]

# Load the diabetes dataset and shuffle it the same way (the permutation is
# drawn after the iris one, from the same generator).
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data, diabetes.target = diabetes.data[perm], diabetes.target[perm]
|
||||
|
||||
|
||||
def test_classification():
    # Smoke test: fit and predict with a variety of sampling settings and
    # base classifiers.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )
    param_grid = ParameterGrid(
        {
            "max_samples": [0.5, 1.0],
            "max_features": [1, 4],
            "bootstrap": [True, False],
            "bootstrap_features": [True, False],
        }
    )
    base_classifiers = [
        None,
        DummyClassifier(),
        Perceptron(max_iter=20),
        DecisionTreeClassifier(max_depth=2),
        KNeighborsClassifier(),
        SVC(),
    ]
    # Pair each parameter combination with the next base classifier (cycling
    # through them) instead of running the full cartesian product, to keep
    # the test duration low.
    for params, base_estimator in zip(param_grid, cycle(base_classifiers)):
        ensemble = BaggingClassifier(
            base_estimator=base_estimator,
            random_state=rng,
            n_estimators=2,
            **params,
        )
        ensemble.fit(X_train, y_train).predict(X_test)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_format, params, method",
    product(
        [csc_matrix, csr_matrix],
        [
            {
                "max_samples": 0.5,
                "max_features": 2,
                "bootstrap": True,
                "bootstrap_features": True,
            },
            {
                "max_samples": 1.0,
                "max_features": 4,
                "bootstrap": True,
                "bootstrap_features": True,
            },
            {"max_features": 2, "bootstrap": False, "bootstrap_features": True},
            {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
        ],
        ["predict", "predict_proba", "predict_log_proba", "decision_function"],
    ),
)
def test_sparse_classification(sparse_format, params, method):
    # Sparse and dense training data must lead to the same outputs, and the
    # base estimators must actually receive the sparse container.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        scale(iris.data), iris.target, random_state=rng
    )

    def fit_ensemble(X_fit):
        # Identical configuration and seed for the sparse and dense runs.
        base = CustomSVC(kernel="linear", decision_function_shape="ovr")
        ensemble = BaggingClassifier(base_estimator=base, random_state=1, **params)
        return ensemble.fit(X_fit, y_train)

    X_train_sparse = sparse_format(X_train)
    X_test_sparse = sparse_format(X_test)

    # Trained on sparse format
    sparse_classifier = fit_ensemble(X_train_sparse)
    sparse_results = getattr(sparse_classifier, method)(X_test_sparse)

    # Trained on dense format
    dense_classifier = fit_ensemble(X_train)
    dense_results = getattr(dense_classifier, method)(X_test)
    assert_array_almost_equal(sparse_results, dense_results)

    expected_type = type(X_train_sparse)
    seen_types = [est.data_type_ for est in sparse_classifier.estimators_]

    assert all(seen == expected_type for seen in seen_types)
|
||||
|
||||
|
||||
def test_regression():
    # Smoke test: fit and predict for every base regressor combined with
    # every sampling configuration.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )
    param_grid = ParameterGrid(
        {
            "max_samples": [0.5, 1.0],
            "max_features": [0.5, 1.0],
            "bootstrap": [True, False],
            "bootstrap_features": [True, False],
        }
    )
    base_regressors = [
        None,
        DummyRegressor(),
        DecisionTreeRegressor(),
        KNeighborsRegressor(),
        SVR(),
    ]

    # ``product`` varies the parameter set fastest, i.e. the same order as a
    # loop over regressors with an inner loop over the grid.
    for base_estimator, params in product(base_regressors, param_grid):
        ensemble = BaggingRegressor(
            base_estimator=base_estimator, random_state=rng, **params
        )
        ensemble.fit(X_train, y_train).predict(X_test)
|
||||
|
||||
|
||||
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input:
    # sparse and dense training data must give the same predictions, and the
    # base estimators must be fit on the sparse container itself.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {
            "max_samples": 0.5,
            "max_features": 2,
            "bootstrap": True,
            "bootstrap_features": True,
        },
        {
            "max_samples": 1.0,
            "max_features": 4,
            "bootstrap": True,
            "bootstrap_features": True,
        },
        {"max_features": 2, "bootstrap": False, "bootstrap_features": True},
        {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_regressor = BaggingRegressor(
                base_estimator=CustomSVR(), random_state=1, **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_regressor.predict(X_test_sparse)

            # Trained on dense format (same seed, so the sampling matches)
            dense_results = (
                BaggingRegressor(base_estimator=CustomSVR(), random_state=1, **params)
                .fit(X_train, y_train)
                .predict(X_test)
            )

            sparse_type = type(X_train_sparse)
            types = [est.data_type_ for est in sparse_regressor.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all(t == sparse_type for t in types)
|
||||
|
||||
|
||||
class DummySizeEstimator(BaseEstimator):
    """Minimal estimator that records the size and a hash of its training data."""

    def fit(self, X, y):
        # Number of rows and a content hash of the data this estimator saw.
        self.training_size_ = X.shape[0]
        self.training_hash_ = joblib.hash(X)
        # Return the fitted estimator, like the other ``fit`` overrides in
        # this file, so that ``fit`` calls can be chained.
        return self
|
||||
|
||||
|
||||
def test_bootstrap_samples():
    # Bootstrapping the samples must produce base estimators that no longer
    # fit the training set perfectly.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    reference_tree = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    no_bootstrap = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=False,
        random_state=rng,
    ).fit(X_train, y_train)

    reference_score = reference_tree.score(X_train, y_train)
    assert reference_score == no_bootstrap.score(X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    with_bootstrap = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=True,
        random_state=rng,
    ).fit(X_train, y_train)

    reference_score = reference_tree.score(X_train, y_train)
    assert reference_score > with_bootstrap.score(X_train, y_train)

    # check that each sampling correspond to a complete bootstrap resample.
    # the size of each bootstrap should be the same as the input data but
    # the data should be different (checked using the hash of the data).
    size_recorder = BaggingRegressor(
        base_estimator=DummySizeEstimator(), bootstrap=True
    ).fit(X_train, y_train)
    seen_hashes = []
    for fitted in size_recorder.estimators_:
        assert fitted.training_size_ == X_train.shape[0]
        seen_hashes.append(fitted.training_hash_)
    assert len(set(seen_hashes)) == len(seen_hashes)
|
||||
|
||||
|
||||
def test_bootstrap_features():
    # Drawing features with replacement may select the same feature twice;
    # drawing without replacement never does.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    n_features = diabetes.data.shape[1]

    without_replacement = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=rng,
    ).fit(X_train, y_train)

    for selected in without_replacement.estimators_features_:
        assert n_features == np.unique(selected).shape[0]

    with_replacement = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=rng,
    ).fit(X_train, y_train)

    for selected in with_replacement.estimators_features_:
        assert n_features > np.unique(selected).shape[0]
|
||||
|
||||
|
||||
def test_probability():
    # Predicted probabilities must sum to one per sample and agree with the
    # exponential of the predicted log-probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )

    def check_probabilities(ensemble):
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
        )

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        check_probabilities(
            BaggingClassifier(
                base_estimator=DecisionTreeClassifier(), random_state=rng
            ).fit(X_train, y_train)
        )

        # Degenerate case, where some classes are missing
        check_probabilities(
            BaggingClassifier(
                base_estimator=LogisticRegression(), random_state=rng, max_samples=5
            ).fit(X_train, y_train)
        )
|
||||
|
||||
|
||||
def test_oob_score_classification():
    # The out-of-bag score should be close to the score measured on held-out
    # data.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        many_estimators = BaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=rng,
        )
        many_estimators.fit(X_train, y_train)

        held_out_score = many_estimators.score(X_test, y_test)

        assert abs(held_out_score - many_estimators.oob_score_) < 0.1

        # Test with few estimators
        warn_msg = (
            "Some inputs do not have OOB scores. This probably means too few "
            "estimators were used to compute any reliable oob estimates."
        )
        with pytest.warns(UserWarning, match=warn_msg):
            single_estimator = BaggingClassifier(
                base_estimator=base_estimator,
                n_estimators=1,
                bootstrap=True,
                oob_score=True,
                random_state=rng,
            )
            single_estimator.fit(X_train, y_train)
|
||||
|
||||
|
||||
def test_oob_score_regression():
    # The out-of-bag score should be close to the score measured on held-out
    # data.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    many_estimators = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        n_estimators=50,
        bootstrap=True,
        oob_score=True,
        random_state=rng,
    )
    many_estimators.fit(X_train, y_train)

    held_out_score = many_estimators.score(X_test, y_test)

    assert abs(held_out_score - many_estimators.oob_score_) < 0.1

    # Test with few estimators
    warn_msg = (
        "Some inputs do not have OOB scores. This probably means too few "
        "estimators were used to compute any reliable oob estimates."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        single_estimator = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(),
            n_estimators=1,
            bootstrap=True,
            oob_score=True,
            random_state=rng,
        )
        single_estimator.fit(X_train, y_train)
|
||||
|
||||
|
||||
def test_single_estimator():
    # A one-member ensemble without any resampling should predict like the
    # plain base estimator.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    singleton_ensemble = BaggingRegressor(
        base_estimator=KNeighborsRegressor(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=rng,
    )
    singleton_ensemble.fit(X_train, y_train)

    plain_knn = KNeighborsRegressor().fit(X_train, y_train)

    ensemble_pred = singleton_ensemble.predict(X_test)
    assert_array_almost_equal(ensemble_pred, plain_knn.predict(X_test))
|
||||
|
||||
|
||||
def test_error():
    # Invalid ``max_samples`` / ``max_features`` settings must be rejected
    # with a ValueError when fitting.
    X, y = iris.data, iris.target
    base = DecisionTreeClassifier()

    # Test max_samples
    for invalid_max_samples in (-1, 0.0, 2.0, 1000, "foobar"):
        with pytest.raises(ValueError):
            BaggingClassifier(base, max_samples=invalid_max_samples).fit(X, y)

    # Test max_features
    for invalid_max_features in (-1, 0.0, 2.0, 5, "foobar"):
        with pytest.raises(ValueError):
            BaggingClassifier(base, max_features=invalid_max_features).fit(X, y)

    # Test support of decision_function
    fitted_on_trees = BaggingClassifier(base).fit(X, y)
    assert not hasattr(fitted_on_trees, "decision_function")
|
||||
|
||||
|
||||
def test_parallel_classification():
    # Results must not depend on the number of jobs, neither at fit time nor
    # at prediction time.
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=0
    )

    # predict_proba
    parallel_trees = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=3, random_state=0
    )
    parallel_trees.fit(X_train, y_train)

    proba_3_jobs = parallel_trees.predict_proba(X_test)
    parallel_trees.set_params(n_jobs=1)
    proba_1_job = parallel_trees.predict_proba(X_test)
    assert_array_almost_equal(proba_3_jobs, proba_1_job)

    sequential_trees = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=1, random_state=0
    )
    sequential_trees.fit(X_train, y_train)

    proba_sequential_fit = sequential_trees.predict_proba(X_test)
    assert_array_almost_equal(proba_3_jobs, proba_sequential_fit)

    # decision_function
    parallel_svc = BaggingClassifier(
        SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0
    )
    parallel_svc.fit(X_train, y_train)

    decisions_3_jobs = parallel_svc.decision_function(X_test)
    parallel_svc.set_params(n_jobs=1)
    decisions_1_job = parallel_svc.decision_function(X_test)
    assert_array_almost_equal(decisions_3_jobs, decisions_1_job)

    sequential_svc = BaggingClassifier(
        SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0
    )
    sequential_svc.fit(X_train, y_train)

    decisions_sequential_fit = sequential_svc.decision_function(X_test)
    assert_array_almost_equal(decisions_3_jobs, decisions_sequential_fit)
|
||||
|
||||
|
||||
def test_parallel_regression():
    # Predictions must not depend on the number of jobs, neither at fit time
    # nor at prediction time.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    parallel_fit = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0)
    parallel_fit.fit(X_train, y_train)

    parallel_fit.set_params(n_jobs=1)
    pred_1_job = parallel_fit.predict(X_test)
    parallel_fit.set_params(n_jobs=2)
    pred_2_jobs = parallel_fit.predict(X_test)
    assert_array_almost_equal(pred_1_job, pred_2_jobs)

    sequential_fit = BaggingRegressor(
        DecisionTreeRegressor(), n_jobs=1, random_state=0
    )
    sequential_fit.fit(X_train, y_train)

    pred_sequential_fit = sequential_fit.predict(X_test)
    assert_array_almost_equal(pred_1_job, pred_sequential_fit)
|
||||
|
||||
|
||||
def test_gridsearch():
    # Check that bagging ensembles can be grid-searched.
    # Transform iris into a binary classification task. The labels are
    # copied first: ``iris`` is module-level state shared by the other tests,
    # and relabelling ``iris.target`` in place would change the data they see
    # depending on test execution order.
    X, y = iris.data, iris.target.copy()
    y[y == 2] = 1

    # Grid search with scoring based on decision_function
    parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)}

    GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y)
|
||||
|
||||
|
||||
def test_base_estimator():
    # ``base_estimator_`` must be a decision tree when no base estimator is
    # given, and of the given estimator's type otherwise.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )

    default_clf = BaggingClassifier(None, n_jobs=3, random_state=0)
    default_clf.fit(X_train, y_train)
    assert isinstance(default_clf.base_estimator_, DecisionTreeClassifier)

    tree_clf = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0)
    tree_clf.fit(X_train, y_train)
    assert isinstance(tree_clf.base_estimator_, DecisionTreeClassifier)

    perceptron_clf = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0)
    perceptron_clf.fit(X_train, y_train)
    assert isinstance(perceptron_clf.base_estimator_, Perceptron)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    default_reg = BaggingRegressor(None, n_jobs=3, random_state=0)
    default_reg.fit(X_train, y_train)
    assert isinstance(default_reg.base_estimator_, DecisionTreeRegressor)

    tree_reg = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0)
    tree_reg.fit(X_train, y_train)
    assert isinstance(tree_reg.base_estimator_, DecisionTreeRegressor)

    svr_reg = BaggingRegressor(SVR(), n_jobs=3, random_state=0)
    svr_reg.fit(X_train, y_train)
    assert isinstance(svr_reg.base_estimator_, SVR)
|
||||
|
||||
|
||||
def test_bagging_with_pipeline():
    # After fitting, the last step of the first bagged pipeline must carry an
    # integer ``random_state``.
    bagged_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(bagged_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)

    first_pipeline = estimator[0]
    _, final_step = first_pipeline.steps[-1]
    assert isinstance(final_step.random_state, int)
|
||||
|
||||
|
||||
class DummyZeroEstimator(BaseEstimator):
    """Estimator that always predicts the first of the classes seen in ``fit``."""

    def fit(self, X, y):
        # ``np.unique`` returns the sorted distinct labels.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        # One zero index per sample, i.e. every prediction is ``classes_[0]``.
        first_class_index = np.zeros(X.shape[0], dtype=int)
        return self.classes_[first_class_index]
|
||||
|
||||
|
||||
def test_bagging_sample_weight_unsupported_but_passed():
    # Passing ``sample_weight`` must raise a ValueError when the base
    # estimator's ``fit`` does not accept it.
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    # Without sample weights, fitting and predicting work.
    estimator.fit(iris.data, iris.target).predict(iris.data)

    n_samples = iris.data.shape[0]
    with pytest.raises(ValueError):
        estimator.fit(
            iris.data,
            iris.target,
            sample_weight=rng.randint(10, size=(n_samples)),
        )
|
||||
|
||||
|
||||
def test_warm_start(random_state=42):
    # Growing an ensemble incrementally with warm start must give an
    # ensemble of the right size whose members use the same seeds as a
    # single normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is not None:
            # Later rounds only raise the target size of the ensemble.
            clf_ws.set_params(n_estimators=n_estimators)
        else:
            clf_ws = BaggingClassifier(
                n_estimators=n_estimators, random_state=random_state, warm_start=True
            )
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BaggingClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    clf_no_ws.fit(X, y)

    warm_start_seeds = {tree.random_state for tree in clf_ws}
    single_fit_seeds = {tree.random_state for tree in clf_no_ws}
    assert warm_start_seeds == single_fit_seeds
|
||||
|
||||
|
||||
def test_warm_start_smaller_n_estimators():
    # A warm-started refit asking for fewer estimators than already fitted
    # must raise a ValueError.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    warm_clf = BaggingClassifier(n_estimators=5, warm_start=True)
    warm_clf.fit(X, y)

    warm_clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        warm_clf.fit(X, y)
|
||||
|
||||
|
||||
def test_warm_start_equal_n_estimators():
    # A warm-started refit with an unchanged n_estimators must warn and
    # leave the predictions untouched.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    warm_clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    warm_clf.fit(X_train, y_train)

    pred_before_refit = warm_clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.0

    warn_msg = "Warm-start fitting without increasing n_estimators does not"
    with pytest.warns(UserWarning, match=warn_msg):
        warm_clf.fit(X_train, y_train)
    pred_after_refit = warm_clf.predict(X_test)
    assert_array_equal(pred_before_refit, pred_after_refit)
|
||||
|
||||
|
||||
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    # Two rounds of warm-started fitting: 5 estimators, then 10.
    incremental = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141)
    incremental.fit(X_train, y_train)
    incremental.set_params(n_estimators=10)
    incremental.fit(X_train, y_train)
    incremental_pred = incremental.predict(X_test)

    # One regular fit with 10 estimators and the same seed.
    one_shot = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141)
    one_shot.fit(X_train, y_train)
    one_shot_pred = one_shot.predict(X_test)

    assert_array_almost_equal(incremental_pred, one_shot_pred)
|
||||
|
||||
|
||||
def test_warm_start_with_oob_score_fails():
    # Requesting oob_score together with warm_start must raise a ValueError
    # when fitting.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    incompatible_clf = BaggingClassifier(
        n_estimators=5, warm_start=True, oob_score=True
    )
    with pytest.raises(ValueError):
        incompatible_clf.fit(X, y)
|
||||
|
||||
|
||||
def test_oob_score_removed_on_warm_start():
    # After a warm-started refit with oob_score disabled, the ``oob_score_``
    # attribute from the first fit must no longer be accessible.
    X, y = make_hastie_10_2(n_samples=100, random_state=1)

    ensemble = BaggingClassifier(n_estimators=5, oob_score=True)
    ensemble.fit(X, y)

    ensemble.set_params(warm_start=True, oob_score=False, n_estimators=10)
    ensemble.fit(X, y)

    with pytest.raises(AttributeError):
        getattr(ensemble, "oob_score_")
|
||||
|
||||
|
||||
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.5,
        oob_score=True,
        random_state=1,
    )
    first_oob_score = bagging.fit(X, y).oob_score_
    second_oob_score = bagging.fit(X, y).oob_score_
    assert first_oob_score == second_oob_score
|
||||
|
||||
|
||||
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        LogisticRegression(),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    all_samples = bagging.estimators_samples_
    all_features = bagging.estimators_features_
    all_estimators = bagging.estimators_

    # Test for correct formatting
    assert len(all_samples) == len(all_estimators)
    assert len(all_samples[0]) == len(X) // 2
    assert all_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    first_samples = all_samples[0]
    first_features = all_features[0]
    first_estimator = all_estimators[0]

    # Rebuild the exact sub-matrix the first estimator was trained on.
    X_train = (X[first_samples])[:, first_features]
    y_train = y[first_samples]

    coefs_from_bagging = first_estimator.coef_
    first_estimator.fit(X_train, y_train)
    coefs_from_refit = first_estimator.coef_

    assert_array_almost_equal(coefs_from_bagging, coefs_from_refit)
|
||||
|
||||
|
||||
def test_estimators_samples_deterministic():
    # Regression test (see issue #9524): when the base estimator contains a
    # random step such as SparseRandomProjection and a random state is given,
    # refitting a member from the attributes saved on the ensemble must give
    # exactly the coefficients obtained at fit time.
    iris = load_iris()
    X, y = iris.data, iris.target

    projected_logreg = make_pipeline(
        SparseRandomProjection(n_components=2), LogisticRegression()
    )
    clf = BaggingClassifier(
        base_estimator=projected_logreg, max_samples=0.5, random_state=0
    )
    clf.fit(X, y)

    first_member = clf.estimators_[0]
    # copy: refitting below overwrites the fitted state of this member
    expected_coef = first_member.steps[-1][1].coef_.copy()

    rows = clf.estimators_samples_[0]
    cols = clf.estimators_features_[0]
    X_subset = X[rows][:, cols]
    y_subset = y[rows]

    first_member.fit(X_subset, y_subset)
    assert_array_equal(first_member.steps[-1][1].coef_, expected_coef)
|
||||
|
||||
|
||||
def test_max_samples_consistency():
    # A valid integer max_samples supplied by the user must come out of
    # validation unchanged.
    requested = 100
    X, y = make_hastie_10_2(n_samples=2 * requested, random_state=1)
    ensemble = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=requested,
        max_features=0.5,
        random_state=1,
    )
    ensemble.fit(X, y)
    assert ensemble._max_samples == requested
|
||||
|
||||
|
||||
def test_set_oob_score_label_encoding():
    # The OOB score must not depend on how the class labels are encoded.
    # See: https://github.com/scikit-learn/scikit-learn/issues/8933
    seed = 5
    X = [[-1], [0], [1]] * 5
    label_encodings = (
        ["A", "B", "C"] * 5,
        [-1, 0, 1] * 5,
        [0, 1, 2] * 5,
    )
    # One freshly seeded ensemble per encoding, fitted in the listed order
    score_str, score_signed, score_plain = [
        BaggingClassifier(oob_score=True, random_state=seed)
        .fit(X, labels)
        .oob_score_
        for labels in label_encodings
    ]
    assert [score_str, score_signed] == [score_plain, score_plain]
|
||||
|
||||
|
||||
def replace(X):
    # Return a float copy of X in which every non-finite entry (NaN, +inf,
    # -inf) has been set to 0; the input array is left untouched.
    cleaned = X.astype("float", copy=True)
    non_finite = np.logical_not(np.isfinite(cleaned))
    cleaned[non_finite] = 0
    return cleaned
|
||||
|
||||
|
||||
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    # as long as the wrapped estimator knows how to deal with it.
    X = np.array(
        [
            [1, 3, 5],
            [2, None, 6],
            [2, np.nan, 6],
            [2, np.inf, 6],
            # -np.inf rather than np.NINF: that alias was removed in NumPy 2.0
            [2, -np.inf, 6],
        ]
    )
    # One single-output and one multi-output target
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array(
            [
                [2, 1, 9],
                [3, 6, 8],
                [3, 6, 8],
                [3, 6, 8],
                [3, 6, 8],
            ]
        ),
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        # `replace` zeroes the non-finite entries before the tree sees them
        pipeline = make_pipeline(FunctionTransformer(replace), regressor)
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert y.shape == y_hat.shape

    # Verify that exceptions can be raised by wrapper regressor.
    # `y` is still bound to the last target of the loop above.
    regressor = DecisionTreeRegressor()
    pipeline = make_pipeline(regressor)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_regressor = BaggingRegressor(pipeline)
    with pytest.raises(ValueError):
        bagging_regressor.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    # as long as the wrapped estimator knows how to deal with it.
    X = np.array(
        [
            [1, 3, 5],
            [2, None, 6],
            [2, np.nan, 6],
            [2, np.inf, 6],
            # -np.inf rather than np.NINF: that alias was removed in NumPy 2.0
            [2, -np.inf, 6],
        ]
    )
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    # `replace` zeroes the non-finite entries before the tree sees them
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    # Smoke-check the probability outputs as well
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)
|
||||
|
||||
|
||||
def test_bagging_small_max_features():
    # Smoke test: fitting must succeed with a low fractional max_features
    # (0.3 of the two available columns).
    features = np.array([[1, 2], [3, 4]])
    labels = np.array([1, 0])

    ensemble = BaggingClassifier(
        LogisticRegression(), max_features=0.3, random_state=1
    )
    ensemble.fit(features, labels)
|
||||
|
||||
|
||||
def test_bagging_get_estimators_indices():
    # Check that Bagging estimator can generate sample indices properly.
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16436
    rng = np.random.RandomState(0)
    X = rng.randn(13, 4)
    # The targets are 0..12, so the y handed to a member doubles as the list
    # of row indices that were drawn for it.
    y = np.arange(13)

    class MyEstimator(DecisionTreeRegressor):
        """An estimator which stores y indices information at fit."""

        def fit(self, X, y):
            self._sample_indices = y

    clf = BaggingRegressor(
        base_estimator=MyEstimator(), n_estimators=1, random_state=0
    )
    clf.fit(X, y)

    recorded = clf.estimators_[0]._sample_indices
    assert_array_equal(recorded, clf.estimators_samples_[0])
|
||||
|
||||
|
||||
# FIXME: remove in 1.2
|
||||
@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor])
def test_n_features_deprecation(Estimator):
    # Reading `n_features_` on a fitted estimator must emit the deprecation
    # FutureWarning.
    features = np.array([[1, 2], [3, 4]])
    targets = np.array([1, 0])
    fitted = Estimator().fit(features, targets)

    with pytest.warns(FutureWarning, match="`n_features_` was deprecated"):
        fitted.n_features_
|
||||
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Testing for the base module (sklearn.ensemble._base).
|
||||
"""
|
||||
|
||||
# Authors: Gilles Louppe
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.ensemble import BaggingClassifier
|
||||
from sklearn.ensemble._base import _set_random_states
|
||||
from sklearn.linear_model import Perceptron
|
||||
from collections import OrderedDict
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.feature_selection import SelectFromModel
|
||||
|
||||
|
||||
def test_base():
    # Exercise the BaseEnsemble helpers through a BaggingClassifier.
    ensemble = BaggingClassifier(
        base_estimator=Perceptron(random_state=None), n_estimators=3
    )

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    # Drop the fitted members so the list can be rebuilt by hand
    ensemble.estimators_ = []

    # Three appended members: one without a seed source, two drawing from a
    # shared RandomState; a fourth is created but not appended.
    ensemble._make_estimator()
    shared_rng = np.random.RandomState(3)
    ensemble._make_estimator(random_state=shared_rng)
    ensemble._make_estimator(random_state=shared_rng)
    ensemble._make_estimator(append=False)

    assert 3 == len(ensemble)
    assert 3 == len(ensemble.estimators_)

    unseeded, seeded_a, seeded_b = ensemble[0], ensemble[1], ensemble[2]
    assert isinstance(unseeded, Perceptron)
    assert unseeded.random_state is None
    assert isinstance(seeded_a.random_state, int)
    assert isinstance(seeded_b.random_state, int)
    assert seeded_a.random_state != seeded_b.random_state

    # A NumPy integer must be accepted as n_estimators
    np_int_ensemble = BaggingClassifier(
        base_estimator=Perceptron(), n_estimators=np.int32(3)
    )
    np_int_ensemble.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_base_zero_n_estimators():
    # Fitting an ensemble configured with n_estimators=0 must raise a
    # ValueError carrying the expected message.
    iris = load_iris()
    empty_ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=0)
    expected_msg = "n_estimators must be greater than zero, got 0."
    with pytest.raises(ValueError, match=expected_msg):
        empty_ensemble.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_base_not_int_n_estimators():
    # Fitting with a non-integer n_estimators (a string, then a float) must
    # raise a ValueError asking for an integer.
    iris = load_iris()
    for bad_n_estimators in ("3", 3.0):
        ensemble = BaggingClassifier(
            base_estimator=Perceptron(), n_estimators=bad_n_estimators
        )
        with pytest.raises(ValueError, match="n_estimators must be an integer"):
            ensemble.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_set_random_states():
    # Smoke test: Linear Discriminant Analysis has no random_state parameter
    _set_random_states(LinearDiscriminantAnalysis(), random_state=17)

    first = Perceptron(random_state=None)
    assert first.random_state is None
    # Passing random_state=None must still assign an integer seed
    _set_random_states(first, None)
    assert isinstance(first.random_state, int)

    # The same seed applied to two estimators yields the same random_state
    _set_random_states(first, 3)
    assert isinstance(first.random_state, int)
    second = Perceptron(random_state=None)
    _set_random_states(second, 3)
    assert first.random_state == second.random_state

    # Nested random_state parameters
    def make_steps():
        selector = SelectFromModel(Perceptron(random_state=None))
        classifier = Perceptron(random_state=None)
        return [("sel", selector), ("clf", classifier)]

    reference = Pipeline(make_steps())
    _set_random_states(reference, 3)
    assert isinstance(reference.steps[0][1].estimator.random_state, int)
    assert isinstance(reference.steps[1][1].random_state, int)
    reference_params = reference.get_params()
    assert (
        reference_params["sel__estimator__random_state"]
        != reference_params["clf__random_state"]
    )

    # The seeds assigned to several random_state parameters must not depend
    # on the order in which get_params() lists them: try ascending and
    # descending key order.
    class AlphaParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            items = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(items))

    class RevParamPipeline(Pipeline):
        def get_params(self, *args, **kwargs):
            items = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(items, reverse=True))

    checked_keys = ("sel__estimator__random_state", "clf__random_state")
    for pipeline_cls in [AlphaParamPipeline, RevParamPipeline]:
        reordered = pipeline_cls(make_steps())
        _set_random_states(reordered, 3)
        for key in checked_keys:
            assert reference.get_params()[key] == reordered.get_params()[key]
|
||||
@@ -0,0 +1,257 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.base import ClassifierMixin
|
||||
from sklearn.base import is_classifier
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.datasets import load_iris, load_diabetes
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.linear_model import LogisticRegression, LinearRegression
|
||||
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||
|
||||
from sklearn.ensemble import StackingClassifier, StackingRegressor
|
||||
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
||||
|
||||
# Module-level datasets referenced by the parametrized tests below:
# iris is bound to (X, y) and diabetes to (X_r, y_r).
X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR()),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR()),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # check fitted attributes
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    assert sorted(list(estimator.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )

    # check that set_params() does not add a new attribute
    estimator_new_params = clone(estimator)
    svm_estimator = SVC() if is_classifier(estimator) else SVR()
    estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
    assert not hasattr(estimator_new_params, "svm")
    assert (
        estimator_new_params.named_estimators.lr.get_params()
        == estimator.named_estimators.lr.get_params()
    )
    assert (
        estimator_new_params.named_estimators.rf.get_params()
        == estimator.named_estimators.rf.get_params()
    )

    # check the behavior when setting and dropping an estimator
    estimator_dropped = clone(estimator)
    estimator_dropped.set_params(svm="drop")
    estimator_dropped.fit(X, y)
    assert len(estimator_dropped.named_estimators) == 3
    assert estimator_dropped.named_estimators.svm == "drop"
    assert len(estimator_dropped.named_estimators_) == 3
    assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )
    # Iterate over the fitted objects themselves: looping over the mapping
    # directly would yield its string keys, for which the isinstance check
    # below could never fail.
    for sub_est in estimator_dropped.named_estimators_.values():
        # check that the correspondence is correct: nothing stored after the
        # drop may still be of the original SVM type
        assert not isinstance(sub_est, type(estimator.named_estimators.svm))

    # check that we can set the parameters of the underlying classifier
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    assert (
        estimator.get_params()["svm__C"]
        == estimator.get_params()["svm"].get_params()["C"]
    )
    assert (
        estimator.get_params()["rf__max_depth"]
        == estimator.get_params()["rf"].get_params()["max_depth"]
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Ensemble",
    [StackingClassifier, VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # An ensemble must reject, at fit time, a sub-estimator of the wrong
    # kind: a regressor inside a classifier ensemble, or the reverse.
    wants_classifier = issubclass(Ensemble, ClassifierMixin)
    if wants_classifier:
        X, y = make_classification(n_samples=10)
        mismatched = [("lr", LinearRegression())]
        ensemble_type = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        mismatched = [("lr", LogisticRegression())]
        ensemble_type = "regressor"
    ensemble = Ensemble(estimators=mismatched)

    expected_msg = f"should be a {ensemble_type}"
    with pytest.raises(ValueError, match=expected_msg):
        ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # Invalid estimator names must be rejected at fit time.
    # Use a sub-estimator whose kind matches the ensemble so that only the
    # names are at fault.
    if issubclass(Ensemble, ClassifierMixin):
        Base = LogisticRegression
    else:
        Base = LinearRegression

    invalid_cases = [
        # the name contains a dunder
        (
            [("lr__", Base())],
            r"Estimator names must not contain __: got \['lr__'\]",
        ),
        # the names are not unique
        (
            [("lr", Base()), ("lr", Base())],
            r"Names provided are not unique: \['lr', 'lr'\]",
        ),
        # the name conflicts with a constructor parameter
        (
            [("estimators", Base())],
            "Estimator names conflict with constructor arguments",
        ),
    ]
    for estimators, err_msg in invalid_cases:
        ensemble = Ensemble(estimators=estimators)
        with pytest.raises(ValueError, match=err_msg):
            ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    # Dropping the only sub-estimator leaves nothing to fit; every ensemble
    # class must report this with the same ValueError.
    estimator.set_params(lr="drop")
    expected_msg = "All estimators are dropped."
    with pytest.raises(ValueError, match=expected_msg):
        estimator.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that Voting and Stacking predictor delegate the missing values
    # validation to the underlying estimator.
    # Work on a copy: X is a module-level dataset shared with other tests.
    X = X.copy()
    # Seeded generator so the NaN pattern (about 10% of the entries) is the
    # same on every run instead of depending on the global NumPy state.
    rng = np.random.RandomState(42)
    mask = rng.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    # The imputer inside each pipeline is what makes the NaNs acceptable
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
    ensemble.fit(X, y).score(X, y)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Testing for the gradient boosting loss functions and initial estimators.
|
||||
"""
|
||||
from itertools import product
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
import pytest
|
||||
from pytest import approx
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.metrics import mean_pinball_loss
|
||||
from sklearn.ensemble._gb_losses import RegressionLossFunction
|
||||
from sklearn.ensemble._gb_losses import LeastSquaresError
|
||||
from sklearn.ensemble._gb_losses import LeastAbsoluteError
|
||||
from sklearn.ensemble._gb_losses import HuberLossFunction
|
||||
from sklearn.ensemble._gb_losses import QuantileLossFunction
|
||||
from sklearn.ensemble._gb_losses import BinomialDeviance
|
||||
from sklearn.ensemble._gb_losses import MultinomialDeviance
|
||||
from sklearn.ensemble._gb_losses import ExponentialLoss
|
||||
from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS
|
||||
|
||||
|
||||
def test_binomial_deviance():
    # Check binomial deviance loss.
    # Check against alternative definitions in ESLII.
    bd = BinomialDeviance(2)

    # pred has the same BD for y in {0, 1}
    assert bd(np.array([0.0]), np.array([0.0])) == bd(np.array([1.0]), np.array([0.0]))

    # Confident and correct raw predictions give (almost) zero deviance
    assert bd(np.array([1.0, 1, 1]), np.array([100.0, 100, 100])) == approx(0)
    assert bd(np.array([1.0, 0, 0]), np.array([100.0, -100, -100])) == approx(0)

    # check if same results as alternative definition of deviance, from ESLII
    # Eq. (10.18): -loglike = log(1 + exp(-2*z*f))
    # Note:
    # - We use y = {0, 1}, ESL (10.18) uses z in {-1, 1}, hence y=2*y-1
    # - ESL 2*f = pred_raw, hence the factor 2 of ESL disappears.
    # - Deviance = -2*loglike + .., hence a factor of 2 in front.
    def alt_dev(y, raw_pred):
        z = 2 * y - 1
        return 2 * np.mean(np.log(1 + np.exp(-z * raw_pred)))

    # Materialize the product: it is a one-shot iterator, and the cases are
    # walked twice below. Left lazy, the second loop would run zero times and
    # the gradient check would be skipped without any failure.
    test_data = list(
        product(
            (np.array([0.0, 0, 0]), np.array([1.0, 1, 1])),
            (np.array([-5.0, -5, -5]), np.array([3.0, 3, 3])),
        )
    )

    for datum in test_data:
        assert bd(*datum) == approx(alt_dev(*datum))

    # check the negative gradient against alternative formula from ESLII
    # Note: negative_gradient is half the negative gradient.
    def alt_ng(y, raw_pred):
        z = 2 * y - 1
        return z / (1 + np.exp(z * raw_pred))

    for datum in test_data:
        assert bd.negative_gradient(*datum) == approx(alt_ng(*datum))
|
||||
|
||||
|
||||
def test_sample_weight_smoke():
    # Least squares: unit sample weights must give the same loss value as
    # passing no weights at all.
    rng = check_random_state(13)
    targets = rng.rand(100)
    predictions = rng.rand(100)

    least_squares = LeastSquaresError()
    unweighted = least_squares(targets, predictions)
    unit_weights = np.ones(predictions.shape[0], dtype=np.float32)
    weighted = least_squares(targets, predictions, unit_weights)
    assert unweighted == approx(weighted)
|
||||
|
||||
|
||||
def test_sample_weight_init_estimators():
    # Smoke test for init estimators with sample weights: fitting with unit
    # weights must give (nearly) the same initial raw predictions as fitting
    # without weights.
    rng = check_random_state(13)
    X = rng.rand(100, 2)
    sample_weight = np.ones(100)
    reg_y = rng.rand(100)

    clf_y = rng.randint(0, 2, size=100)

    for Loss in LOSS_FUNCTIONS.values():
        if Loss is None:
            continue
        if issubclass(Loss, RegressionLossFunction):
            y, loss = reg_y, Loss()
        elif Loss.is_multi_class:
            # multiclass losses are not covered by this test
            continue
        else:
            # binary classification losses take the number of classes
            y, loss = clf_y, Loss(2)

        expected_shape = (y.shape[0], 1)

        plain_est = loss.init_estimator()
        plain_est.fit(X, y)
        plain_out = loss.get_init_raw_predictions(X, plain_est)
        assert plain_out.shape == expected_shape

        weighted_est = loss.init_estimator()
        weighted_est.fit(X, y, sample_weight=sample_weight)
        weighted_out = loss.get_init_raw_predictions(X, weighted_est)
        assert weighted_out.shape == expected_shape

        # check if predictions match
        assert_allclose(plain_out, weighted_out, rtol=1e-2)
|
||||
|
||||
|
||||
def test_quantile_loss_function():
    # Non regression test for the QuantileLossFunction object:
    # there was a sign problem when evaluating the function
    # for negative values of 'ytrue - ypred'.
    targets = np.asarray([-1.0, 0.0, 1.0])
    zero_pred = np.zeros_like(targets)

    found = QuantileLossFunction(0.9)(targets, zero_pred)
    expected = np.asarray([0.1, 0.0, 0.9]).mean()
    np.testing.assert_allclose(found, expected)

    # The value must also agree with the pinball loss metric
    pinball = mean_pinball_loss(targets, np.zeros_like(targets), alpha=0.9)
    np.testing.assert_allclose(found, pinball)
|
||||
|
||||
|
||||
def test_sample_weight_deviance():
    # Every registered loss must return the same value with unit sample
    # weights as without weights.
    rng = check_random_state(13)
    sample_weight = np.ones(100)
    reg_y = rng.rand(100)
    clf_y = rng.randint(0, 2, size=100)
    mclf_y = rng.randint(0, 3, size=100)

    for Loss in LOSS_FUNCTIONS.values():
        if Loss is None:
            continue
        if issubclass(Loss, RegressionLossFunction):
            # the targets double as the predictions
            y = p = reg_y
            loss = Loss()
        elif Loss.is_multi_class:
            n_classes = 3
            y = mclf_y
            # one-hot encoding of the targets, used as the predictions
            p = (y[:, np.newaxis] == np.arange(n_classes)).astype(np.float64)
            loss = Loss(n_classes)
        else:
            y = p = clf_y
            loss = Loss(2)

        weighted = loss(y, p, sample_weight)
        unweighted = loss(y, p)
        assert_allclose(unweighted, weighted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_classes, n_samples", [(3, 100), (5, 57), (7, 13)])
def test_multinomial_deviance(n_classes, n_samples):
    # Check multinomial deviance with and without sample weights.
    rng = np.random.RandomState(13)
    unit_weights = np.ones(n_samples)
    y_true = rng.randint(0, n_classes, size=n_samples)
    # One-hot encoding of the targets, used as the raw predictions
    class_ids = np.arange(n_classes)
    y_pred = (y_true[:, np.newaxis] == class_ids).astype(np.float64)

    loss = MultinomialDeviance(n_classes)
    unweighted = loss(y_true, y_pred)
    assert unweighted > 0
    weighted = loss(y_true, y_pred, sample_weight=unit_weights)
    assert unweighted == approx(weighted)

    # Multinomial deviance uses weighted average loss rather than
    # weighted sum loss, so we make sure that the value remains the same
    # when we divide the weights by 2.
    half_weighted = loss(y_true, y_pred, sample_weight=0.5 * unit_weights)
    assert unweighted == approx(half_weighted)
|
||||
|
||||
|
||||
def test_mdl_computation_weighted():
    # MultinomialDeviance loss computation with weights, checked against a
    # fixed reference value.
    raw_scores = np.array([[1.0, -1.0, -0.1], [-2.0, 1.0, 2.0]])
    labels = np.array([0, 1])
    sample_weights = np.array([1, 3])
    reference = 1.0909323
    deviance = MultinomialDeviance(3)
    assert deviance(labels, raw_scores, sample_weights) == approx(reference)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n", [0, 1, 2])
def test_mdl_exception(n):
    # Constructing MultinomialDeviance with n_classes <= 2 must raise a
    # ValueError.
    expected_msg = "MultinomialDeviance requires more than 2 classes."
    with pytest.raises(ValueError, match=expected_msg):
        MultinomialDeviance(n)
|
||||
|
||||
|
||||
def test_init_raw_predictions_shapes():
    # Make sure get_init_raw_predictions returns float64 arrays with shape
    # (n_samples, K) where K is 1 for binary classification and regression, and
    # K = n_classes for multiclass classification
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)
    # Regression losses
    for loss in (
        LeastSquaresError(),
        LeastAbsoluteError(),
        QuantileLossFunction(),
        HuberLossFunction(),
    ):
        init_estimator = loss.init_estimator().fit(X, y)
        # Predict on the features the estimator was fitted on, not on y
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    # Binary classification losses
    y = rng.randint(0, 2, size=n_samples)
    for loss in (BinomialDeviance(n_classes=2), ExponentialLoss(n_classes=2)):
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        assert raw_predictions.shape == (n_samples, 1)
        assert raw_predictions.dtype == np.float64

    # Multiclass loss: one column per class
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        assert raw_predictions.shape == (n_samples, n_classes)
        assert raw_predictions.dtype == np.float64
|
||||
|
||||
|
||||
def test_init_raw_predictions_values():
    # Make sure the get_init_raw_predictions() returns the expected values for
    # each loss.
    # The init estimators are always queried with the features X they were
    # fitted on, not with the targets y.
    rng = np.random.RandomState(0)

    n_samples = 100
    X = rng.normal(size=(n_samples, 5))
    y = rng.normal(size=n_samples)

    # Least squares loss
    loss = LeastSquaresError()
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
    # Make sure baseline prediction is the mean of all targets
    assert_allclose(raw_predictions, y.mean())

    # Least absolute and huber loss
    for Loss in (LeastAbsoluteError, HuberLossFunction):
        loss = Loss()
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        # Make sure baseline prediction is the median of all targets
        assert_allclose(raw_predictions, np.median(y))

    # Quantile loss
    for alpha in (0.1, 0.5, 0.9):
        loss = QuantileLossFunction(alpha=alpha)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        # Make sure baseline prediction is the alpha-quantile of all targets
        assert_allclose(raw_predictions, np.percentile(y, alpha * 100))

    y = rng.randint(0, 2, size=n_samples)

    # Binomial deviance
    loss = BinomialDeviance(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    # Make sure baseline prediction is equal to link_function(p), where p
    # is the proba of the positive class. We want predict_proba() to return p,
    # and by definition
    # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction)
    # So we want raw_prediction = link_function(p) = log(p / (1 - p))
    raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
    p = y.mean()
    assert_allclose(raw_predictions, np.log(p / (1 - p)))

    # Exponential loss: expected baseline is half the log-odds
    loss = ExponentialLoss(n_classes=2)
    init_estimator = loss.init_estimator().fit(X, y)
    raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
    p = y.mean()
    assert_allclose(raw_predictions, 0.5 * np.log(p / (1 - p)))

    # Multinomial deviance loss: expected baseline for class k is the log of
    # its observed frequency
    for n_classes in range(3, 5):
        y = rng.randint(0, n_classes, size=n_samples)
        loss = MultinomialDeviance(n_classes=n_classes)
        init_estimator = loss.init_estimator().fit(X, y)
        raw_predictions = loss.get_init_raw_predictions(X, init_estimator)
        for k in range(n_classes):
            p = (y == k).mean()
            assert_allclose(raw_predictions[:, k], np.log(p))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(5))
@pytest.mark.parametrize("alpha", [0.4, 0.5, 0.6])
def test_lad_equals_quantiles(seed, alpha):
    """Compare the quantile loss with LAD and with the mean pinball loss.

    When ``alpha == 0.5`` the LAD loss must equal twice the quantile loss,
    both with and without sample weights. For every ``alpha`` the weighted
    quantile loss must match ``mean_pinball_loss`` with the same weights.
    """
    n_samples = 50
    random_state = np.random.RandomState(seed)
    # Predictions are drawn before targets: both come from the same
    # generator, so the draw order determines their values.
    raw_predictions = random_state.normal(size=(n_samples))
    y_true = random_state.normal(size=(n_samples))

    lad = LeastAbsoluteError()
    ql = QuantileLossFunction(alpha=alpha)
    is_median = alpha == 0.5

    # Unweighted losses.
    unweighted_lad = lad(y_true, raw_predictions)
    unweighted_ql = ql(y_true, raw_predictions)
    if is_median:
        assert unweighted_lad == approx(2 * unweighted_ql)

    # Weighted losses, using quadratically increasing weights in [0, 1].
    weights = np.linspace(0, 1, n_samples) ** 2
    weighted_lad = lad(y_true, raw_predictions, sample_weight=weights)
    weighted_ql = ql(y_true, raw_predictions, sample_weight=weights)
    if is_median:
        assert weighted_lad == approx(2 * weighted_ql)

    weighted_pinball = mean_pinball_loss(
        y_true, raw_predictions, sample_weight=weights, alpha=alpha
    )
    assert weighted_pinball == approx(weighted_ql)
|
||||
|
||||
|
||||
def test_exponential_loss():
    """Check the value and negative gradient of the exponential loss at 0.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/9666
    """
    exp_loss = ExponentialLoss(n_classes=2)
    target = np.array([0])
    raw_prediction = np.array([0])

    # Expected loss: exp(0) = 1.
    loss_value = exp_loss(target, raw_prediction)
    assert loss_value == pytest.approx(1)

    # Expected negative gradient: -1 * (1 * exp(0)) = -1.
    gradient = exp_loss.negative_gradient(target, raw_prediction)
    assert_allclose(gradient, -1)
|
||||
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Testing for Isolation Forest algorithm (sklearn.ensemble.iforest).
|
||||
"""
|
||||
|
||||
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.ensemble._iforest import _average_path_length
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_diabetes, load_iris
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
from scipy.sparse import csc_matrix, csr_matrix
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
rng = check_random_state(0)

# Load the iris dataset and shuffle samples and targets with one shared
# permutation so that they stay aligned.
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]

# Do the same for the diabetes dataset; its permutation is drawn from the
# same generator, after the iris one.
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data, diabetes.target = diabetes.data[perm], diabetes.target[perm]
|
||||
|
||||
|
||||
def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    train = np.array([[0, 1], [1, 2]])
    test = np.array([[2, 1], [1, 1]])

    param_space = {
        "n_estimators": [3],
        "max_samples": [0.5, 1.0, 3],
        "bootstrap": [True, False],
    }

    # Smoke test: every combination must fit and predict without raising.
    with ignore_warnings():
        for params in ParameterGrid(param_space):
            forest = IsolationForest(random_state=rng, **params)
            forest.fit(train)
            forest.predict(test)
|
||||
|
||||
|
||||
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )
    grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})

    def fit_predict(train, test, params):
        # Every model uses the same integer seed so that the sparse and the
        # dense runs can be compared directly.
        model = IsolationForest(n_estimators=10, random_state=1, **params)
        return model.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        sparse_train = to_sparse(X_train)
        sparse_test = to_sparse(X_test)

        for params in grid:
            from_sparse = fit_predict(sparse_train, sparse_test, params)
            from_dense = fit_predict(X_train, X_test, params)
            assert_array_equal(from_sparse, from_dense)
|
||||
|
||||
|
||||
def test_iforest_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data
    warn_msg = "max_samples will be set to n_samples for estimation"

    # Out-of-range max_samples values must be rejected.
    for invalid in (-1, 0.0, 2.0):
        with pytest.raises(ValueError):
            IsolationForest(max_samples=invalid).fit(X)

    # The dataset has less than 256 samples: explicitly asking for more
    # samples than available must warn ...
    with pytest.warns(UserWarning, match=warn_msg):
        IsolationForest(max_samples=1000).fit(X)

    # ... whereas these settings must not emit any UserWarning (warnings are
    # turned into errors inside each context).
    for silent in ("auto", np.int64(2)):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            IsolationForest(max_samples=silent).fit(X)

    # Unknown strings and floats above 1 must be rejected as well.
    for invalid in ("foobar", 1.5):
        with pytest.raises(ValueError):
            IsolationForest(max_samples=invalid).fit(X)

    # The number of features at predict time must match the one seen in fit.
    with pytest.raises(ValueError):
        IsolationForest().fit(X).predict(X[:, 1:])
|
||||
|
||||
|
||||
def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    forest = IsolationForest().fit(X)

    # Every tree is expected to be capped at ceil(log2(n_samples)).
    expected_depth = int(np.ceil(np.log2(X.shape[0])))
    for tree in forest.estimators_:
        assert tree.max_depth == expected_depth
|
||||
|
||||
|
||||
def test_max_samples_attribute():
    """Check ``max_samples_`` for default, oversized and fractional values."""
    X = iris.data
    n_samples = X.shape[0]

    # Default setting: max_samples_ equals the number of samples here.
    default_forest = IsolationForest().fit(X)
    assert default_forest.max_samples_ == n_samples

    # Requesting more samples than available warns and clips to n_samples.
    oversized_forest = IsolationForest(max_samples=500)
    warn_msg = "max_samples will be set to n_samples for estimation"
    with pytest.warns(UserWarning, match=warn_msg):
        oversized_forest.fit(X)
    assert oversized_forest.max_samples_ == n_samples

    # A float is interpreted as a fraction of n_samples.
    fractional_forest = IsolationForest(max_samples=0.4).fit(X)
    assert fractional_forest.max_samples_ == 0.4 * n_samples
|
||||
|
||||
|
||||
def test_iforest_parallel_regression():
    """Check parallel regression."""
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    # Fit with three jobs, then predict with one and with two jobs: the
    # predictions must not depend on n_jobs at predict time.
    parallel_forest = IsolationForest(n_jobs=3, random_state=0).fit(X_train)

    parallel_forest.set_params(n_jobs=1)
    pred_one_job = parallel_forest.predict(X_test)
    parallel_forest.set_params(n_jobs=2)
    pred_two_jobs = parallel_forest.predict(X_test)
    assert_array_almost_equal(pred_one_job, pred_two_jobs)

    # A forest fitted with a single job and the same seed must agree too.
    sequential_forest = IsolationForest(n_jobs=1, random_state=0).fit(X_train)
    pred_sequential = sequential_forest.predict(X_test)
    assert_array_almost_equal(pred_one_job, pred_sequential)
|
||||
|
||||
|
||||
def test_iforest_performance():
    """Test Isolation Forest performs well.

    A forest is fitted on 100 points drawn from a narrow Gaussian and scored
    on 20 held-out points from the same distribution plus 20 uniformly drawn
    outliers. The ROC AUC of the negated decision function must exceed 0.98.
    """
    # Generate train/test data: the first 100 rows are used for training and
    # the remaining 20 serve as inliers in the test set.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    # 0 marks the held-out inliers, 1 marks the outliers.
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that the scores rank outliers above inliers almost perfectly
    assert roc_auc_score(y_test, y_pred) > 0.98
|
||||
|
||||
|
||||
@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination):
    """Check that the two outliers of a toy sample are detected."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    forest = IsolationForest(random_state=rng, contamination=contamination)
    forest.fit(X)

    # NOTE: exactly one decision_function call and one predict call are made;
    # the chunk tests below count the resulting get_chunk_n_rows calls.
    anomaly_score = -forest.decision_function(X)
    labels = forest.predict(X)

    # Both outliers must score strictly higher than every inlier ...
    outlier_scores = anomaly_score[-2:]
    inlier_scores = anomaly_score[:-2]
    assert np.min(outlier_scores) > np.max(inlier_scores)
    # ... and be the only samples labelled -1.
    assert_array_equal(labels, 6 * [1] + 2 * [-1])
|
||||
|
||||
|
||||
def test_max_samples_consistency():
    """Make sure validated max_samples in iforest and BaseBagging match."""
    forest = IsolationForest().fit(iris.data)
    public_value = forest.max_samples_
    private_value = forest._max_samples
    assert public_value == private_value
|
||||
|
||||
|
||||
def test_iforest_subsampled_features():
    """Non-regression test for #5732, which failed at predict."""
    rng = check_random_state(0)
    X_train, X_test, y_train, _ = train_test_split(
        diabetes.data[:50], diabetes.target[:50], random_state=rng
    )

    # Smoke test: predicting with feature subsampling must not raise.
    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    forest.predict(X_test)
|
||||
|
||||
|
||||
def test_iforest_average_path_length():
    """Check ``_average_path_length`` on known values and its monotonicity.

    Non-regression test for #8549, which used the wrong formula for average
    path length, strictly for the integer case. Updated to check average path
    length when input is <= 2 (issue #11839).
    """
    expected_5 = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    expected_999 = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0

    # (input, expected average path length) pairs checked one at a time.
    scalar_cases = [
        (0, 0.0),
        (1, 0.0),
        (2, 1.0),
        (5, expected_5),
        (999, expected_999),
    ]
    for n_samples_leaf, expected in scalar_cases:
        assert_allclose(_average_path_length([n_samples_leaf]), [expected])

    # The same values must come out when passed together as one array.
    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, expected_5, expected_999],
    )

    # _average_path_length is increasing
    lengths = _average_path_length(np.arange(5))
    assert_array_equal(lengths, np.sort(lengths))
|
||||
|
||||
|
||||
def test_score_samples():
    """Check ``score_samples`` against ``decision_function + offset_``.

    The identity is checked for a forest with an explicit contamination and
    for one with the default; both forests must also give the same
    ``score_samples`` on the query point.
    """
    X_train = [[1, 1], [1, 2], [2, 1]]
    query = [[2.0, 2.0]]

    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)

    for forest in (clf1, clf2):
        assert_array_equal(
            forest.score_samples(query),
            forest.decision_function(query) + forest.offset_,
        )

    assert_array_equal(clf1.score_samples(query), clf2.score_samples(query))
|
||||
|
||||
|
||||
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest"""
    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # First round: grow 10 trees and keep a reference to the first one.
    forest = IsolationForest(
        n_estimators=10, max_samples=20, random_state=rng, warm_start=True
    )
    forest.fit(X)
    first_tree = forest.estimators_[0]

    # Second round: ask for 20 trees in total and refit.
    forest.set_params(n_estimators=20)
    forest.fit(X)

    # 20 fitted trees are expected and the earlier ones must be untouched.
    n_trees = len(forest.estimators_)
    assert n_trees == 20
    assert forest.estimators_[0] is first_tree
|
||||
|
||||
|
||||
# mock get_chunk_n_rows to actually test more than one chunk (here one
|
||||
# chunk = 3 rows):
|
||||
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(return_value=3),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls):
    """Re-run ``test_iforest_works`` with ``get_chunk_n_rows`` mocked to 3.

    Also checks how many times the mocked helper was called.
    """
    test_iforest_works(contamination)
    observed_calls = mocked_get_chunk.call_count
    assert observed_calls == n_predict_calls
|
||||
|
||||
|
||||
# idem with chunk_size = 10 rows
|
||||
@patch(
    "sklearn.ensemble._iforest.get_chunk_n_rows",
    side_effect=Mock(return_value=10),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls):
    """Re-run ``test_iforest_works`` with ``get_chunk_n_rows`` mocked to 10.

    Also checks how many times the mocked helper was called.
    """
    test_iforest_works(contamination)
    observed_calls = mocked_get_chunk.call_count
    assert observed_calls == n_predict_calls
|
||||
|
||||
|
||||
def test_iforest_with_uniform_data():
    """Test whether iforest predicts inliers when using uniform data"""

    def only_inliers(model, data):
        # True when every row of `data` is predicted as an inlier (+1).
        return all(model.predict(data) == 1)

    # 2-d array of all 1s
    X = np.ones((100, 10))
    iforest = IsolationForest()
    iforest.fit(X)

    # All random inputs below come from this generator, in this order.
    rng = np.random.RandomState(0)

    assert only_inliers(iforest, X)
    assert only_inliers(iforest, rng.randn(100, 10))
    assert only_inliers(iforest, X + 1)
    assert only_inliers(iforest, X - 1)

    # 2-d array where columns contain the same value across rows
    X = np.repeat(rng.randn(1, 10), 100, 0)
    iforest = IsolationForest()
    iforest.fit(X)

    assert only_inliers(iforest, X)
    assert only_inliers(iforest, rng.randn(100, 10))
    assert only_inliers(iforest, np.ones((100, 10)))

    # Single row
    X = rng.randn(1, 10)
    iforest = IsolationForest()
    iforest.fit(X)

    assert only_inliers(iforest, X)
    assert only_inliers(iforest, rng.randn(100, 10))
    assert only_inliers(iforest, np.ones((100, 10)))
|
||||
|
||||
|
||||
# FIXME: remove in 1.2
|
||||
def test_n_features_deprecation():
    """Check that accessing ``n_features_`` raises a deprecation warning."""
    samples = np.array([[1, 2], [3, 4]])
    labels = np.array([1, 0])
    forest = IsolationForest().fit(samples, labels)

    expected_msg = "`n_features_` was deprecated"
    with pytest.warns(FutureWarning, match=expected_msg):
        # The attribute access itself is what triggers the warning.
        forest.n_features_
|
||||
@@ -0,0 +1,741 @@
|
||||
"""Test the stacking classifier and regressor."""
|
||||
|
||||
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_equal
|
||||
import scipy.sparse as sparse
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import ClassifierMixin
|
||||
from sklearn.base import RegressorMixin
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.preprocessing import scale
|
||||
|
||||
from sklearn.ensemble import StackingClassifier
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
from sklearn.utils._mocking import CheckingClassifier
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils._testing import assert_allclose_dense_sparse
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
from sklearn.exceptions import NotFittedError
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
# Datasets shared by the tests below: diabetes and iris.
diabetes = load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target

iris = load_iris()
X_iris = iris.data
y_iris = iris.target
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
)
@pytest.mark.parametrize(
    "final_estimator", [None, RandomForestClassifier(random_state=42)]
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """Check fit/predict/transform of StackingClassifier on iris.

    The checks run once with both base estimators and once after dropping
    "lr", each time verifying the width of the transformed output and, with
    passthrough, that the original features are appended unchanged.
    """
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )

    def check_transform(stacker, n_columns):
        # Check the output width; with passthrough the last 4 columns must be
        # the input features.
        transformed = stacker.transform(X_test)
        assert transformed.shape[1] == n_columns
        if passthrough:
            assert_allclose(X_test, transformed[:, -4:])

    base_estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
    clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )

    # Both base estimators active.
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8
    check_transform(clf, 10 if passthrough else 6)

    # Same checks after dropping the logistic regression.
    clf.set_params(lr="drop")
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)
    check_transform(clf, 7 if passthrough else 3)
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_column_binary_classification():
    """Check that a column is dropped in binary classification."""
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(
        scale(X), y, stratify=y, random_state=42
    )

    # both classifiers implement 'predict_proba' and will both drop one column
    proba_estimators = [
        ("lr", LogisticRegression()),
        ("rf", RandomForestClassifier(random_state=42)),
    ]
    clf = StackingClassifier(estimators=proba_estimators, cv=3)
    clf.fit(X_train, y_train)
    n_columns = clf.transform(X_test).shape[1]
    assert n_columns == 2

    # LinearSVC does not implement 'predict_proba' and will not drop one column
    mixed_estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
    clf.set_params(estimators=mixed_estimators)
    clf.fit(X_train, y_train)
    n_columns = clf.transform(X_test).shape[1]
    assert n_columns == 2
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_estimator():
    """Check that a "drop" entry gives the same results as omitting it."""
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    estimators_with_drop = [("lr", "drop"), ("svc", LinearSVC(random_state=0))]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(
        estimators=[("svc", LinearSVC(random_state=0))], final_estimator=rf, cv=5
    )
    clf_drop = StackingClassifier(
        estimators=estimators_with_drop, final_estimator=rf, cv=5
    )

    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)

    # Both stackers must agree on every output method.
    for method in ("predict", "predict_proba", "transform"):
        reference = getattr(clf, method)(X_test)
        with_drop = getattr(clf_drop, method)(X_test)
        assert_allclose(reference, with_drop)
|
||||
|
||||
|
||||
def test_stacking_regressor_drop_estimator():
    """Check that a "drop" entry gives the same results as omitting it."""
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_diabetes), y_diabetes, random_state=42
    )
    estimators_with_drop = [("lr", "drop"), ("svr", LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(
        estimators=[("svr", LinearSVR(random_state=0))], final_estimator=rf, cv=5
    )
    reg_drop = StackingRegressor(
        estimators=estimators_with_drop, final_estimator=rf, cv=5
    )

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)

    # Both stackers must agree on every output method.
    for method in ("predict", "transform"):
        reference = getattr(reg, method)(X_test)
        with_drop = getattr(reg_drop, method)(X_test)
        assert_allclose(reference, with_drop)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize(
    "final_estimator, predict_params",
    [
        (None, {}),
        (RandomForestRegressor(random_state=42), {}),
        (DummyRegressor(), {"return_std": True}),
    ],
)
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
    """Check fit/predict/transform of StackingRegressor on diabetes.

    The checks run once with both base estimators and once after dropping
    "lr", each time verifying the width of the transformed output and, with
    passthrough, that the original features are appended unchanged.
    """
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_diabetes), y_diabetes, random_state=42
    )

    def check_transform(stacker, n_columns):
        # Check the output width; with passthrough the last 10 columns must
        # be the input features.
        transformed = stacker.transform(X_test)
        assert transformed.shape[1] == n_columns
        if passthrough:
            assert_allclose(X_test, transformed[:, -10:])

    base_estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
    reg = StackingRegressor(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )

    # Both base estimators active.
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    if predict_params:
        # Extra predict parameters are forwarded; the result is then expected
        # to hold two items.
        assert len(result) == 2
    check_transform(reg, 12 if passthrough else 2)

    # Same checks after dropping the linear regression.
    reg.set_params(lr="drop")
    reg.fit(X_train, y_train)
    reg.predict(X_test)
    check_transform(reg, 11 if passthrough else 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fmt", ["csc", "csr", "coo"])
def test_stacking_regressor_sparse_passthrough(fmt):
    """Check passthrough behavior on a sparse X matrix."""
    X_sparse = sparse.coo_matrix(scale(X_diabetes)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_diabetes, random_state=42
    )

    stacker = StackingRegressor(
        estimators=[("lr", LinearRegression()), ("svr", LinearSVR())],
        final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
        cv=5,
        passthrough=True,
    )
    stacker.fit(X_train, y_train)
    X_trans = stacker.transform(X_test)

    # The last 10 columns must be the input features, and the output must
    # stay sparse in the same format as the input.
    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fmt", ["csc", "csr", "coo"])
def test_stacking_classifier_sparse_passthrough(fmt):
    """Check passthrough behavior on a sparse X matrix."""
    X_sparse = sparse.coo_matrix(scale(X_iris)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(X_sparse, y_iris, random_state=42)

    stacker = StackingClassifier(
        estimators=[("lr", LogisticRegression()), ("svc", LinearSVC())],
        final_estimator=RandomForestClassifier(n_estimators=10, random_state=42),
        cv=5,
        passthrough=True,
    )
    stacker.fit(X_train, y_train)
    X_trans = stacker.transform(X_test)

    # The last 4 columns must be the input features, and the output must
    # stay sparse in the same format as the input.
    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
|
||||
|
||||
|
||||
def test_stacking_classifier_drop_binary_prob():
    """Check that one probability column is dropped for a binary problem."""
    # Select only the 2 first classes
    X_binary = scale(X_iris[:100])
    y_binary = y_iris[:100]

    stacker = StackingClassifier(
        estimators=[("lr", LogisticRegression()), ("rf", RandomForestClassifier())]
    )
    stacker.fit(X_binary, y_binary)

    # Two base estimators are expected to give one meta-feature each.
    meta_features = stacker.transform(X_binary)
    assert meta_features.shape[1] == 2
|
||||
|
||||
|
||||
class NoWeightRegressor(RegressorMixin, BaseEstimator):
    """Regressor whose ``fit`` takes no ``sample_weight`` argument."""

    def fit(self, X, y):
        """Fit a ``DummyRegressor``, store it on ``self.reg`` and return the
        result of its ``fit`` call."""
        dummy = DummyRegressor()
        self.reg = dummy
        return dummy.fit(X, y)

    def predict(self, X):
        """Return a vector of ones with one entry per row of ``X``."""
        n_rows = X.shape[0]
        return np.ones(n_rows)
|
||||
|
||||
|
||||
class NoWeightClassifier(ClassifierMixin, BaseEstimator):
    """Classifier whose ``fit`` takes no ``sample_weight`` argument."""

    def fit(self, X, y):
        """Fit a stratified ``DummyClassifier``, store it on ``self.clf`` and
        return the result of its ``fit`` call."""
        dummy = DummyClassifier(strategy="stratified")
        self.clf = dummy
        return dummy.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, params, type_err, msg_err",
    [
        (y_iris, {"estimators": None}, ValueError, "Invalid 'estimators' attribute,"),
        (y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
        (
            y_iris,
            {
                "estimators": [
                    ("lr", LogisticRegression()),
                    ("svm", SVC(max_iter=5e4)),
                ],
                "stack_method": "predict_proba",
            },
            ValueError,
            "does not implement the method predict_proba",
        ),
        (
            y_iris,
            {
                "estimators": [
                    ("lr", LogisticRegression()),
                    ("cor", NoWeightClassifier()),
                ]
            },
            TypeError,
            "does not support sample weight",
        ),
        (
            y_iris,
            {
                "estimators": [
                    ("lr", LogisticRegression()),
                    ("cor", LinearSVC(max_iter=5e4)),
                ],
                "final_estimator": NoWeightClassifier(),
            },
            TypeError,
            "does not support sample weight",
        ),
        (
            y_iris,
            {"estimators": [("lr", LogisticRegression())], "passthrough": "foo"},
            TypeError,
            "passthrough must be an instance of",
        ),
    ],
)
def test_stacking_classifier_error(y, params, type_err, msg_err):
    """Check the error raised by StackingClassifier on invalid settings.

    Each case builds the stacker from ``params`` and fits it with unit sample
    weights; an exception of type ``type_err`` matching ``msg_err`` must be
    raised.
    """
    X_scaled = scale(X_iris)
    unit_weights = np.ones(X_iris.shape[0])
    with pytest.raises(type_err, match=msg_err):
        stacker = StackingClassifier(cv=3, **params)
        stacker.fit(X_scaled, y, sample_weight=unit_weights)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "y, params, type_err, msg_err",
    [
        (
            y_diabetes,
            {"estimators": None},
            ValueError,
            "Invalid 'estimators' attribute,",
        ),
        (y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
        (
            y_diabetes,
            {"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]},
            TypeError,
            "does not support sample weight",
        ),
        (
            y_diabetes,
            {
                "estimators": [("lr", LinearRegression()), ("cor", LinearSVR())],
                "final_estimator": NoWeightRegressor(),
            },
            TypeError,
            "does not support sample weight",
        ),
        (
            y_diabetes,
            {"estimators": [("lr", LinearRegression())], "passthrough": "foo"},
            TypeError,
            "passthrough must be an instance of",
        ),
    ],
)
def test_stacking_regressor_error(y, params, type_err, msg_err):
    """Check the error raised by StackingRegressor on invalid settings.

    Each case builds the stacker from ``params`` and fits it with unit sample
    weights; an exception of type ``type_err`` matching ``msg_err`` must be
    raised.
    """
    X_scaled = scale(X_diabetes)
    unit_weights = np.ones(X_diabetes.shape[0])
    with pytest.raises(type_err, match=msg_err):
        stacker = StackingRegressor(cv=3, **params)
        stacker.fit(X_scaled, y, sample_weight=unit_weights)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression(random_state=0)),
                    ("svm", LinearSVC(random_state=0)),
                ]
            ),
            X_iris[:100],
            y_iris[:100],
        ),  # keep only classes 0 and 1
        (
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(random_state=0)),
                ]
            ),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_randomness(estimator, X, y):
    """Check that fixing the random state of the CV gives the same results.

    A stacker with both base estimators and one with "lr" dropped are fitted
    with identically seeded splitters; apart from its first column, the full
    output must equal the output of the reduced stacker.
    """

    def seeded_cv():
        # Each stacker gets its own splitter with a fresh RandomState(0).
        return KFold(shuffle=True, random_state=np.random.RandomState(0))

    estimator_full = clone(estimator)
    estimator_full.set_params(cv=seeded_cv())

    estimator_drop = clone(estimator)
    estimator_drop.set_params(lr="drop")
    estimator_drop.set_params(cv=seeded_cv())

    full_features = estimator_full.fit(X, y).transform(X)
    drop_features = estimator_drop.fit(X, y).transform(X)

    # Skip the first column of the full output (presumably the one produced
    # by "lr") before comparing.
    assert_allclose(full_features[:, 1:], drop_features)
|
||||
|
||||
|
||||
def test_stacking_classifier_stratify_default():
    """Check that we stratify the classes for the default CV."""
    base_estimators = [
        ("lr", LogisticRegression(max_iter=1e4)),
        ("svm", LinearSVC(max_iter=1e4)),
    ]
    stacker = StackingClassifier(estimators=base_estimators)

    # since iris is not shuffled, a simple k-fold would not contain the
    # 3 classes during training
    stacker.fit(X_iris, y_iris)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "stacker, X, y",
    [
        (
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(random_state=42)),
                ],
                final_estimator=LogisticRegression(),
                cv=KFold(shuffle=True, random_state=42),
            ),
            *load_breast_cancer(return_X_y=True),
        ),
        (
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(random_state=42)),
                ],
                final_estimator=LinearRegression(),
                cv=KFold(shuffle=True, random_state=42),
            ),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_with_sample_weight(stacker, X, y):
    """Check that sample weights have an influence on the fitting.

    Fitting without weights and with unit weights must give the same
    predictions, while non-uniform weights must change them.
    """
    n_samples = len(y)
    n_half_samples = n_samples // 2
    # First half weighted 0.1, remaining samples weighted 0.9.
    total_sample_weight = np.repeat(
        [0.1, 0.9], [n_half_samples, n_samples - n_half_samples]
    )
    X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
        X, y, total_sample_weight, random_state=42
    )

    def fit_then_predict(**fit_params):
        # ConvergenceWarning is silenced during fit: convergence is not what
        # is being tested here.
        with ignore_warnings(category=ConvergenceWarning):
            stacker.fit(X_train, y_train, **fit_params)
        return stacker.predict(X_test)

    y_pred_no_weight = fit_then_predict()
    y_pred_unit_weight = fit_then_predict(sample_weight=np.ones(y_train.shape))
    assert_allclose(y_pred_no_weight, y_pred_unit_weight)

    y_pred_biased = fit_then_predict(sample_weight=sample_weight_train)
    total_difference = np.abs(y_pred_no_weight - y_pred_biased).sum()
    assert total_difference > 0
|
||||
|
||||
|
||||
def test_stacking_classifier_sample_weight_fit_param():
    """Check that sample_weight is passed to all invocations of fit."""
    base_checker = CheckingClassifier(expected_sample_weight=True)
    final_checker = CheckingClassifier(expected_sample_weight=True)
    stacker = StackingClassifier(
        estimators=[("lr", base_checker)],
        final_estimator=final_checker,
    )

    unit_weights = np.ones(X_iris.shape[0])
    stacker.fit(X_iris, y_iris, sample_weight=unit_weights)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize(
    "stacker, X, y",
    [
        (
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(random_state=42)),
                ],
                final_estimator=LogisticRegression(),
            ),
            *load_breast_cancer(return_X_y=True),
        ),
        (
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(random_state=42)),
                ],
                final_estimator=LinearRegression(),
            ),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
def test_stacking_cv_influence(stacker, X, y):
    """Check that `cv` changes the final estimator but not the base ones.

    Two clones of the same stacker are fitted with cv=3 and cv=5: the base
    estimators must end up with identical coefficients, whereas the final
    estimator must differ. ConvergenceWarning is filtered out because
    convergence is not what is being tested here.
    """
    stacker_cv_3 = clone(stacker).set_params(cv=3)
    stacker_cv_5 = clone(stacker).set_params(cv=5)
    for model in (stacker_cv_3, stacker_cv_5):
        model.fit(X, y)

    # the base estimators should be identical
    base_pairs = zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_)
    for base_3, base_5 in base_pairs:
        assert_allclose(base_3.coef_, base_5.coef_)

    # the final estimator should be different
    final_coef_3 = stacker_cv_3.final_estimator_.coef_
    final_coef_5 = stacker_cv_5.final_estimator_.coef_
    with pytest.raises(AssertionError, match="Not equal"):
        assert_allclose(final_coef_3, final_coef_5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "Stacker, Estimator, stack_method, final_estimator, X, y",
    [
        (
            StackingClassifier,
            DummyClassifier,
            "predict_proba",
            LogisticRegression(random_state=42),
            X_iris,
            y_iris,
        ),
        (
            StackingRegressor,
            DummyRegressor,
            "predict",
            LinearRegression(),
            X_diabetes,
            y_diabetes,
        ),
    ],
)
def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y):
    """Check the behaviour of stacking when `cv='prefit'`.

    The base estimators are fitted on one half of the data beforehand; the
    stacker is then fitted on the other half and must reuse them untouched.
    """
    X_fit, X_stack, y_fit, y_stack = train_test_split(
        X, y, random_state=42, test_size=0.5
    )
    estimators = [(name, Estimator().fit(X_fit, y_fit)) for name in ("d0", "d1")]

    # Replace `fit` by a mock and wrap the stack method in a mock that still
    # delegates to the real implementation, so calls can be inspected below.
    for _, prefit in estimators:
        real_stack_func = getattr(prefit, stack_method)
        prefit.fit = Mock()
        setattr(prefit, stack_method, Mock(side_effect=real_stack_func))

    stacker = Stacker(
        estimators=estimators, cv="prefit", final_estimator=final_estimator
    )
    stacker.fit(X_stack, y_stack)

    expected_estimators = [prefit for _, prefit in estimators]
    assert stacker.estimators_ == expected_estimators
    # fit was not called again
    assert all(fitted.fit.call_count == 0 for fitted in stacker.estimators_)

    # stack method is called with the proper inputs
    for fitted in stacker.estimators_:
        getattr(fitted, stack_method).assert_called_with(X_stack)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "stacker, X, y",
    [
        (
            StackingClassifier(
                estimators=[("lr", LogisticRegression()), ("svm", SVC())],
                cv="prefit",
            ),
            X_iris,
            y_iris,
        ),
        (
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR()),
                ],
                cv="prefit",
            ),
            X_diabetes,
            y_diabetes,
        ),
    ],
)
def test_stacking_prefit_error(stacker, X, y):
    """Check that NotFittedError is raised with unfitted bases and cv="prefit"."""
    expected_error = NotFittedError
    with pytest.raises(expected_error):
        stacker.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "make_dataset, Stacking, Estimator",
    [
        (make_classification, StackingClassifier, LogisticRegression),
        (make_regression, StackingRegressor, LinearRegression),
    ],
)
def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
    """Check that stacking supports estimators without `n_features_in_`.

    Regression test for #17353.
    """

    class MyEstimator(Estimator):
        """Estimator without n_features_in_"""

        def fit(self, X, y):
            super().fit(X, y)
            # Drop the attribute the parent class set during fit.
            del self.n_features_in_

    X, y = make_dataset(random_state=0, n_samples=100)
    stacker = Stacking(estimators=[("lr", MyEstimator())])

    # Before fit, the error message refers to the stacker itself.
    unfitted_msg = f"{Stacking.__name__} object has no attribute n_features_in_"
    with pytest.raises(AttributeError, match=unfitted_msg):
        stacker.n_features_in_

    # Does not raise
    stacker.fit(X, y)

    # After fit, the error message refers to the base estimator.
    fitted_msg = "'MyEstimator' object has no attribute 'n_features_in_'"
    with pytest.raises(AttributeError, match=fitted_msg):
        stacker.n_features_in_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "stacker, feature_names, X, y, expected_names",
    [
        (
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression(random_state=0)),
                    ("svm", LinearSVC(random_state=0)),
                ]
            ),
            iris.feature_names,
            X_iris,
            y_iris,
            [
                "stackingclassifier_lr0",
                "stackingclassifier_lr1",
                "stackingclassifier_lr2",
                "stackingclassifier_svm0",
                "stackingclassifier_svm1",
                "stackingclassifier_svm2",
            ],
        ),
        (
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression(random_state=0)),
                    ("other", "drop"),
                    ("svm", LinearSVC(random_state=0)),
                ]
            ),
            iris.feature_names,
            X_iris[:100],
            y_iris[:100],  # keep only classes 0 and 1
            [
                "stackingclassifier_lr",
                "stackingclassifier_svm",
            ],
        ),
        (
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(random_state=0)),
                ]
            ),
            diabetes.feature_names,
            X_diabetes,
            y_diabetes,
            [
                "stackingregressor_lr",
                "stackingregressor_svm",
            ],
        ),
    ],
    ids=[
        "StackingClassifier_multiclass",
        "StackingClassifier_binary",
        "StackingRegressor",
    ],
)
@pytest.mark.parametrize("passthrough", [True, False])
def test_get_feature_names_out(
    stacker, feature_names, X, y, expected_names, passthrough
):
    """Check get_feature_names_out works for stacking.

    With `passthrough=True` the input feature names are expected to be
    appended after the names generated for the base estimators.
    """
    stacker.set_params(passthrough=passthrough)
    stacker.fit(scale(X), y)

    if passthrough:
        names_wanted = np.concatenate((expected_names, feature_names))
    else:
        names_wanted = expected_names

    names_out = stacker.get_feature_names_out(feature_names)
    assert_array_equal(names_out, names_wanted)
|
||||
@@ -0,0 +1,698 @@
|
||||
"""Testing for the VotingClassifier and VotingRegressor"""
|
||||
|
||||
import pytest
|
||||
import re
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.exceptions import NotFittedError
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn import datasets
|
||||
from sklearn.model_selection import cross_val_score, train_test_split
|
||||
from sklearn.datasets import make_multilabel_classification
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin, clone
|
||||
from sklearn.dummy import DummyRegressor
|
||||
|
||||
|
||||
# Load datasets
# Classification fixture: iris, restricted to feature columns 1 and 2.
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# Regression fixture: the diabetes dataset as an (X, y) pair.
X_r, y_r = datasets.load_diabetes(return_X_y=True)
|
||||
|
||||
|
||||
def test_invalid_type_for_flatten_transform():
    """Check that a non-boolean `flatten_transform` raises a TypeError on fit."""
    named_estimators = [("lr", LogisticRegression())]
    ensemble = VotingClassifier(estimators=named_estimators, flatten_transform="foo")

    with pytest.raises(TypeError, match="flatten_transform must be an instance of"):
        ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, y, voter, learner",
    [
        (X, y, VotingClassifier, {"estimators": [("lr", LogisticRegression())]}),
        (X_r, y_r, VotingRegressor, {"estimators": [("lr", LinearRegression())]}),
    ],
)
@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        ({"verbose": -1}, ValueError, "verbose == -1, must be >= 0"),
        ({"verbose": "foo"}, TypeError, "verbose must be an instance of"),
    ],
)
def test_voting_estimators_param_validation(
    X, y, voter, learner, params, err_type, err_msg
):
    """Check that an invalid `verbose` value raises the proper exception on fit.

    Parameters
    ----------
    X, y : training data matching the kind of `voter`.
    voter : voting estimator class to instantiate.
    learner : dict holding the `estimators` constructor argument.
    params : dict holding the invalid constructor argument under test.
    err_type : exception class expected from `fit`.
    err_msg : pattern the exception message must match.
    """
    # Merge into a fresh dict: the `params` object is shared by every
    # parametrized case built from the same tuple, so updating it in place
    # would leak the `estimators` entry from one case into the next.
    init_kwargs = {**params, **learner}
    ensemble = voter(**init_kwargs)
    with pytest.raises(err_type, match=err_msg):
        ensemble.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (
            {"estimators": []},
            "Invalid 'estimators' attribute, 'estimators' should be a list of",
        ),
        (
            {"estimators": [("lr", LogisticRegression())], "voting": "error"},
            r"Voting must be 'soft' or 'hard'; got \(voting='error'\)",
        ),
        (
            {"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
            "Number of `estimators` and weights must be equal",
        ),
    ],
)
def test_voting_classifier_estimator_init(params, err_msg):
    """Check that invalid constructor arguments raise a ValueError on fit."""
    classifier = VotingClassifier(**params)
    with pytest.raises(ValueError, match=err_msg):
        classifier.fit(X, y)
|
||||
|
||||
|
||||
def test_predictproba_hardvoting():
    """Check that `predict_proba` is not exposed when voting='hard'."""
    named_estimators = [(name, LogisticRegression()) for name in ("lr1", "lr2")]
    eclf = VotingClassifier(estimators=named_estimators, voting="hard")

    expected_msg = "predict_proba is not available when voting='hard'"
    with pytest.raises(AttributeError, match=expected_msg):
        eclf.predict_proba

    # The attribute must be hidden both before and after fitting.
    assert not hasattr(eclf, "predict_proba")
    eclf.fit(X, y)
    assert not hasattr(eclf, "predict_proba")
|
||||
|
||||
|
||||
def test_notfitted():
    """Check that unfitted voting estimators raise NotFittedError."""
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    template = (
        "This %s instance is not fitted yet. Call 'fit'"
        " with appropriate arguments before using this estimator."
    )

    # (estimator, method to call, input data, class name in the message)
    checks = [
        (eclf, "predict", X, "VotingClassifier"),
        (eclf, "predict_proba", X, "VotingClassifier"),
        (eclf, "transform", X, "VotingClassifier"),
        (ereg, "predict", X_r, "VotingRegressor"),
        (ereg, "transform", X_r, "VotingRegressor"),
    ]
    for estimator, method_name, data, cls_name in checks:
        with pytest.raises(NotFittedError, match=template % cls_name):
            getattr(estimator, method_name)(data)
|
||||
|
||||
|
||||
def test_majority_label_iris():
    """Check classification by majority label on dataset iris."""
    named_estimators = [
        ("lr", LogisticRegression(solver="liblinear", random_state=123)),
        ("rf", RandomForestClassifier(n_estimators=10, random_state=123)),
        ("gnb", GaussianNB()),
    ]
    eclf = VotingClassifier(estimators=named_estimators, voting="hard")

    cv_scores = cross_val_score(eclf, X, y, scoring="accuracy")
    mean_accuracy = cv_scores.mean()
    assert_almost_equal(mean_accuracy, 0.95, decimal=2)
|
||||
|
||||
|
||||
def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    tie_index = 73  # sample on which the two base classifiers disagree
    logreg = LogisticRegression(random_state=123, solver="liblinear")
    forest = RandomForestClassifier(random_state=123)
    eclf = VotingClassifier(
        estimators=[("lr", logreg), ("rf", forest)], voting="hard"
    )

    logreg_label = logreg.fit(X, y).predict(X)[tie_index]
    assert logreg_label == 2

    forest_label = forest.fit(X, y).predict(X)[tie_index]
    assert forest_label == 1

    voted_label = eclf.fit(X, y).predict(X)[tie_index]
    assert voted_label == 1
|
||||
|
||||
|
||||
def test_weights_iris():
    """Check classification by average probabilities on dataset iris."""
    named_estimators = [
        ("lr", LogisticRegression(random_state=123)),
        ("rf", RandomForestClassifier(random_state=123)),
        ("gnb", GaussianNB()),
    ]
    eclf = VotingClassifier(
        estimators=named_estimators,
        voting="soft",
        weights=[1, 2, 10],
    )

    cv_scores = cross_val_score(eclf, X, y, scoring="accuracy")
    mean_accuracy = cv_scores.mean()
    assert_almost_equal(mean_accuracy, 0.93, decimal=2)
|
||||
|
||||
|
||||
def test_weights_regressor():
    """Check weighted average regression prediction on diabetes dataset.

    The ensemble prediction must equal the weighted average of the individual
    regressors' predictions, and `weights=None` must behave like equal weights.
    """
    reg1 = DummyRegressor(strategy="mean")
    reg2 = DummyRegressor(strategy="median")
    reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
    ereg = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
    )

    # Seed the split so the test exercises the same data on every run.
    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
        X_r, y_r, test_size=0.25, random_state=0
    )

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    # Reference value: weighted average computed by hand with the same weights.
    avg = np.average(
        np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
    )
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
    )
    ereg_weights_equal = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
    )
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
|
||||
|
||||
|
||||
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array(
        [[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
    )
    y = np.array([1, 1, 1, 2, 2, 2])
    expected_labels = [1, 1, 1, 2, 2, 2]

    # Each base classifier recovers the training labels on its own...
    for base_clf in (clf1, clf2, clf3):
        assert_array_equal(base_clf.fit(X, y).predict(X), expected_labels)

    # ...and so does the ensemble, with hard and then soft voting.
    for voting in ("hard", "soft"):
        eclf = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
            voting=voting,
            weights=[1, 1, 1],
        )
        assert_array_equal(eclf.fit(X, y).predict(X), expected_labels)
|
||||
|
||||
|
||||
def test_predict_proba_on_toy_problem():
    """Calculate predicted probabilities on toy dataset.

    Soft-voting probabilities are compared (to one decimal) with the weighted
    average of hard-coded reference probabilities for each base classifier;
    hard voting must not expose `predict_proba` at all.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Reference per-classifier probabilities used to build expected values.
    clf1_res = np.array(
        [
            [0.59790391, 0.40209609],
            [0.57622162, 0.42377838],
            [0.50728456, 0.49271544],
            [0.40241774, 0.59758226],
        ]
    )

    clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])

    clf3_res = np.array(
        [[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
    )

    # Weighted averages with weights [2, 1, 1] (sum of weights = 4).
    t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
    t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
    t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
    t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4

    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        weights=[2, 1, 1],
    )
    eclf_res = eclf.fit(X, y).predict_proba(X)

    assert_almost_equal(t00, eclf_res[0][0], decimal=1)
    assert_almost_equal(t11, eclf_res[1][1], decimal=1)
    assert_almost_equal(t21, eclf_res[2][1], decimal=1)
    assert_almost_equal(t31, eclf_res[3][1], decimal=1)

    # Build and fit outside the `raises` block so that only the access to
    # `predict_proba` can satisfy the expected AttributeError.
    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
    )
    eclf.fit(X, y)
    with pytest.raises(
        AttributeError, match="predict_proba is not available when voting='hard'"
    ):
        eclf.predict_proba(X)
|
||||
|
||||
|
||||
def test_multilabel():
    """Check if error is raised for multilabel classification."""
    X, y = make_multilabel_classification(
        n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
    )
    clf = OneVsRestClassifier(SVC(kernel="linear"))

    eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")

    # Use `pytest.raises` so the test fails when no error is raised; a bare
    # try/except that returns on the exception also passes when `fit`
    # unexpectedly succeeds.
    with pytest.raises(NotImplementedError):
        eclf.fit(X, y)
|
||||
|
||||
|
||||
def test_gridsearch():
    """Check GridSearch support."""
    named_estimators = [
        ("lr", LogisticRegression(random_state=1)),
        ("rf", RandomForestClassifier(random_state=1, n_estimators=3)),
        ("gnb", GaussianNB()),
    ]
    eclf = VotingClassifier(estimators=named_estimators, voting="soft")

    # Mix a nested estimator parameter with ensemble-level parameters.
    param_grid = {
        "lr__C": [1.0, 100.0],
        "voting": ["soft", "hard"],
        "weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
    }

    search = GridSearchCV(estimator=eclf, param_grid=param_grid, cv=2)
    search.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_parallel_fit():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    def fit_with_jobs(n_jobs):
        # Same soft-voting ensemble, differing only in the number of jobs.
        voter = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
            voting="soft",
            n_jobs=n_jobs,
        )
        return voter.fit(X, y)

    sequential = fit_with_jobs(1)
    parallel = fit_with_jobs(2)

    assert_array_equal(sequential.predict(X), parallel.predict(X))
    assert_array_almost_equal(sequential.predict_proba(X), parallel.predict_proba(X))
|
||||
|
||||
|
||||
def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)

    # Unit weights must be equivalent to passing no weights at all.
    unit_weights = np.ones((len(y),))
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    )
    eclf1.fit(X, y, sample_weight=unit_weights)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    )
    eclf2.fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    # A single-estimator ensemble must match that estimator fitted directly
    # with the same (random) weights.
    rng = np.random.RandomState(123)
    sample_weight = rng.uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(
        estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
    )
    unsupported_msg = (
        "Underlying estimator KNeighborsClassifier does not support sample weights."
    )
    with pytest.raises(TypeError, match=unsupported_msg):
        eclf3.fit(X, y, sample_weight)

    # check that _fit_single_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    # NOTE(review): the call below invokes `fit` on the estimator directly
    # rather than going through a voting ensemble -- confirm this is intended.
    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
        def fit(self, X, y, sample_weight):
            raise TypeError("Error unrelated to sample_weight.")

    failing_clf = ClassifierErrorFit()
    with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
        failing_clf.fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
def test_sample_weight_kwargs():
    """Check that VotingClassifier passes sample_weight as kwargs"""

    class MockClassifier(ClassifierMixin, BaseEstimator):
        """Mock Classifier to check that sample_weight is received as kwargs"""

        def fit(self, X, y, *args, **sample_weight):
            # The weights must arrive by keyword, not positionally.
            assert "sample_weight" in sample_weight

    mock_clf = MockClassifier()
    eclf = VotingClassifier(estimators=[("mock", mock_clf)], voting="soft")

    # Should not raise an error.
    unit_weights = np.ones((len(y),))
    eclf.fit(X, y, sample_weight=unit_weights)
|
||||
|
||||
|
||||
def test_voting_classifier_set_params():
    """Check equivalence in the output when setting underlying estimators."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    naive_bayes = GaussianNB()

    # Reference ensemble, built directly with the random forest.
    eclf1 = VotingClassifier(
        [("lr", logreg), ("rf", forest)], voting="soft", weights=[1, 2]
    )
    eclf1.fit(X, y)

    # Same ensemble obtained by swapping the "nb" slot through set_params.
    eclf2 = VotingClassifier(
        [("lr", logreg), ("nb", naive_bayes)], voting="soft", weights=[1, 2]
    )
    eclf2.set_params(nb=forest)
    eclf2.fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    first_estimator = eclf2.estimators[0][1]
    second_estimator = eclf2.estimators[1][1]
    assert first_estimator.get_params() == logreg.get_params()
    assert second_estimator.get_params() == forest.get_params()
|
||||
|
||||
|
||||
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    # Reference ensemble: the random forest is kept but given a zero weight.
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    # Ensemble under test: the random forest is replaced by "drop" instead.
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    # "drop" is recorded in the parameters while only the two remaining
    # estimators are fitted.
    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
    )
    assert eclf2.get_params()["rf"] == "drop"

    # Same comparison after switching both ensembles to soft voting.
    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    # Dropping every estimator must be rejected at fit time.
    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    # eclf1 still outputs one probability block per estimator (two blocks);
    # eclf2 only outputs the block of the estimator that was not dropped.
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))
    # With hard voting, transform returns one column per remaining estimator.
    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
|
||||
|
||||
|
||||
def test_estimator_weights_format():
    """Check that weights given as a list or as an array are equivalent."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)

    list_weighted = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
    )
    array_weighted = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
    )
    for voter in (list_weighted, array_weighted):
        voter.fit(X, y)

    proba_from_list = list_weighted.predict_proba(X)
    proba_from_array = array_weighted.predict_proba(X)
    assert_array_almost_equal(proba_from_list, proba_from_array)
|
||||
|
||||
|
||||
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    def fit_soft_voter(**extra_params):
        # Soft-voting ensemble over the three classifiers, fitted on the toy data.
        voter = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
            voting="soft",
            **extra_params,
        )
        return voter.fit(X, y)

    default_voter = fit_soft_voter()
    flat_voter = fit_soft_voter(flatten_transform=True)
    stacked_voter = fit_soft_voter(flatten_transform=False)

    # Flattened output is 2-D; the unflattened output has one block per estimator.
    assert_array_equal(default_voter.transform(X).shape, (4, 6))
    assert_array_equal(flat_voter.transform(X).shape, (4, 6))
    assert_array_equal(stacked_voter.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(default_voter.transform(X), flat_voter.transform(X))

    reshaped = stacked_voter.transform(X).swapaxes(0, 1).reshape((4, 6))
    assert_array_almost_equal(reshaped, flat_voter.transform(X))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X, y, voter",
    [
        (
            X,
            y,
            VotingClassifier(
                [
                    ("lr", LogisticRegression()),
                    ("rf", RandomForestClassifier(n_estimators=5)),
                ]
            ),
        ),
        (
            X_r,
            y_r,
            VotingRegressor(
                [
                    ("lr", LinearRegression()),
                    ("rf", RandomForestRegressor(n_estimators=5)),
                ]
            ),
        ),
    ],
)
def test_none_estimator_with_weights(X, y, voter):
    """Check that an estimator can be dropped while passing sample weights.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/13777
    """
    # Work on a clone so the parametrized instance is left untouched.
    model = clone(voter)
    model.fit(X, y, sample_weight=np.ones(y.shape))

    model.set_params(lr="drop")
    model.fit(X, y, sample_weight=np.ones(y.shape))

    predictions = model.predict(X)
    assert predictions.shape == y.shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "est",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("tree", DecisionTreeRegressor(random_state=0)),
            ]
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("tree", DecisionTreeClassifier(random_state=0)),
            ]
        ),
    ],
    ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
    """Check that `n_features_in_` only exists after fit and counts the columns."""
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    has_attr_before_fit = hasattr(est, "n_features_in_")
    assert not has_attr_before_fit

    est.fit(X, y)
    assert est.n_features_in_ == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("rf", RandomForestRegressor(random_state=123)),
            ],
            verbose=True,
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=123)),
                ("rf", RandomForestClassifier(random_state=123)),
            ],
            verbose=True,
        ),
    ],
)
def test_voting_verbose(estimator, capsys):
    """Check the progress messages printed to stdout when `verbose=True`."""
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # One line per estimator, in order, and nothing after the last one.
    expected_pattern = (
        r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
        r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
    )

    estimator.fit(X, y)

    captured_stdout = capsys.readouterr()[0]
    assert re.match(expected_pattern, captured_stdout)
|
||||
|
||||
|
||||
def test_get_features_names_out_regressor():
    """Check get_feature_names_out output for regressor."""
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    named_estimators = [
        ("lr", LinearRegression()),
        ("tree", DecisionTreeRegressor(random_state=0)),
        ("ignore", "drop"),
    ]
    voting = VotingRegressor(estimators=named_estimators)
    voting.fit(X, y)

    # The dropped estimator must not contribute an output name.
    names_wanted = ["votingregressor_lr", "votingregressor_tree"]
    assert_array_equal(voting.get_feature_names_out(), names_wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs, expected_names",
    [
        (
            {"voting": "soft", "flatten_transform": True},
            [
                "votingclassifier_lr0",
                "votingclassifier_lr1",
                "votingclassifier_lr2",
                "votingclassifier_tree0",
                "votingclassifier_tree1",
                "votingclassifier_tree2",
            ],
        ),
        ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
    ],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]

    named_estimators = [
        ("lr", LogisticRegression(random_state=0)),
        ("tree", DecisionTreeClassifier(random_state=0)),
    ]
    voting = VotingClassifier(estimators=named_estimators, **kwargs)
    voting.fit(X, y)

    transformed = voting.transform(X)
    names_out = voting.get_feature_names_out()

    # The number of names must match the number of transformed columns.
    n_output_columns = transformed.shape[1]
    assert n_output_columns == len(expected_names)
    assert_array_equal(names_out, expected_names)
|
||||
|
||||
|
||||
def test_get_features_names_out_classifier_error():
    """Soft voting without flattening must make feature-name lookup fail.

    With ``voting="soft"`` and ``flatten_transform=False`` the call to
    ``get_feature_names_out`` is expected to raise a ``ValueError``.
    """
    samples = [[1, 2], [3, 4], [5, 6]]
    labels = [0, 1, 2]

    members = [
        ("lr", LogisticRegression(random_state=0)),
        ("tree", DecisionTreeClassifier(random_state=0)),
    ]
    voting = VotingClassifier(
        estimators=members, voting="soft", flatten_transform=False
    )
    voting.fit(samples, labels)

    expected_message = (
        "get_feature_names_out is not supported when `voting='soft'` and "
        "`flatten_transform=False`"
    )
    with pytest.raises(ValueError, match=expected_message):
        voting.get_feature_names_out()
|
||||
@@ -0,0 +1,616 @@
|
||||
"""Testing for the boost module (sklearn.ensemble.boost)."""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import re
|
||||
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal, assert_array_less
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import clone
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.ensemble import AdaBoostClassifier
|
||||
from sklearn.ensemble import AdaBoostRegressor
|
||||
from sklearn.ensemble._weight_boosting import _samme_proba
|
||||
from sklearn.svm import SVC, SVR
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils import shuffle
|
||||
from sklearn.utils._mocking import NoSampleWeightWrapper
|
||||
from sklearn import datasets
|
||||
|
||||
|
||||
# Shared random state. The statements below draw from it one after another,
# so their relative order must be kept.
rng = np.random.RandomState(0)

# Toy problem: three points per cluster, plus three query points.
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo"] * 3 + [1] * 3  # mixes string and integer class labels
y_regr = [-1] * 3 + [1] * 3
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]

# Iris data, shuffled with the shared random state.
iris = datasets.load_iris()
# `perm` is not referenced again in the visible code; the call is kept
# because it is made on the shared `rng` ahead of the shuffles below.
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)

# Diabetes data, shuffled with the shared random state.
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
    diabetes.data,
    diabetes.target,
    random_state=rng,
)
|
||||
|
||||
|
||||
def test_samme_proba():
    """Exercise the ``_samme_proba`` helper on deliberately bad probabilities."""
    # Example (bad) `predict_proba` output: zeros, a large negative entry and
    # rows that do not sum to one before normalisation.
    raw = [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]]
    probs = np.array(raw)
    probs /= np.abs(probs.sum(axis=1, keepdims=True))

    # `_samme_proba` calls `estimator.predict_proba`, so hand it a stub whose
    # return value is fully under the test's control.
    class MockEstimator:
        def predict_proba(self, X):
            assert_array_equal(X.shape, probs.shape)
            return probs

    dummy_input = np.ones_like(probs)
    samme_proba = _samme_proba(MockEstimator(), 3, dummy_input)

    assert_array_equal(samme_proba.shape, probs.shape)
    assert np.isfinite(samme_proba).all()

    # The per-row ordering must survive: check which column is smallest and
    # which is largest in every example.
    smallest = np.argmin(samme_proba, axis=1)
    largest = np.argmax(samme_proba, axis=1)
    assert_array_equal(smallest, [2, 0, 0, 2])
    assert_array_equal(largest, [0, 1, 1, 1])
|
||||
|
||||
|
||||
def test_oneclass_adaboost_proba():
    """``predict_proba`` must cope with training data holding a single class.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/7501
    """
    n_samples = len(X)
    single_label = np.ones(n_samples)

    clf = AdaBoostClassifier().fit(X, single_label)

    expected = np.ones((n_samples, 1))
    assert_array_almost_equal(clf.predict_proba(X), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
    """Fit the classifier on the toy sample and check predictions and shapes."""
    n_queries = len(T)

    model = AdaBoostClassifier(algorithm=algorithm, random_state=0)
    model.fit(X, y_class)

    assert_array_equal(model.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), model.classes_)
    assert model.predict_proba(T).shape == (n_queries, 2)
    assert model.decision_function(T).shape == (n_queries,)
|
||||
|
||||
|
||||
def test_regression_toy():
    """Fit the regressor on the toy sample and check its predictions."""
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(X, y_regr)

    predicted = reg.predict(T)
    assert_array_equal(predicted, y_t_regr)
|
||||
|
||||
|
||||
def test_iris():
    """Consistency checks for both boosting algorithms on the iris data."""
    classes = np.unique(iris.target)
    n_classes = len(classes)
    clf_samme = None
    prob_samme = None

    for alg in ("SAMME", "SAMME.R"):
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            # Kept for the cross-algorithm comparison after the loop.
            clf_samme, prob_samme = clf, proba
        assert proba.shape[1] == n_classes
        assert clf.decision_function(iris.data).shape[1] == n_classes

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)

        # More than one estimator must have been used.
        members = clf.estimators_
        assert len(members) > 1
        # Every member must carry its own random state (see issue #7408).
        seeds = {est.random_state for est in members}
        assert len(seeds) == len(members)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    difference = np.abs(clf_samme.predict_proba(iris.data) - prob_samme)
    assert_array_less(0, difference)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    """Consistency checks for each regression loss on the diabetes data."""
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)

    assert reg.score(diabetes.data, diabetes.target) > 0.55

    # More than one estimator must have been used.
    members = reg.estimators_
    assert len(members) > 1
    # Every member must carry its own random state (see issue #7408).
    seeds = {est.random_state for est in members}
    assert len(seeds) == len(members)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
    """The last staged output must agree with the corresponding final output.

    Covers staged predictions, probabilities and scores for the classifier,
    and staged predictions and scores for the regressor.
    """
    n_stages = 10
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    # AdaBoost classification
    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=n_stages)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = list(clf.staged_predict(iris.data))
    proba = clf.predict_proba(iris.data)
    staged_probas = list(clf.staged_predict_proba(iris.data))
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = list(
        clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    )

    classifier_pairs = (
        (predictions, staged_predictions),
        (proba, staged_probas),
        (score, staged_scores),
    )
    for final, staged in classifier_pairs:
        assert len(staged) == n_stages
        assert_array_almost_equal(final, staged[-1])

    # AdaBoost regression
    reg = AdaBoostRegressor(n_estimators=n_stages, random_state=0)
    reg.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = reg.predict(diabetes.data)
    staged_predictions = list(reg.staged_predict(diabetes.data))
    score = reg.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = list(
        reg.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    )

    regressor_pairs = (
        (predictions, staged_predictions),
        (score, staged_scores),
    )
    for final, staged in regressor_pairs:
        assert len(staged) == n_stages
        assert_array_almost_equal(final, staged[-1])
|
||||
|
||||
|
||||
def test_gridsearch():
    """Parameters of the base tree must be reachable through a grid search."""
    # AdaBoost classification
    classifier_grid = {
        "n_estimators": (1, 2),
        "base_estimator__max_depth": (1, 2),
        "algorithm": ("SAMME", "SAMME.R"),
    }
    classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    search = GridSearchCV(classifier, classifier_grid)
    search.fit(iris.data, iris.target)

    # AdaBoost regression
    regressor_grid = {
        "n_estimators": (1, 2),
        "base_estimator__max_depth": (1, 2),
    }
    regressor = AdaBoostRegressor(
        base_estimator=DecisionTreeRegressor(), random_state=0
    )
    search = GridSearchCV(regressor, regressor_grid)
    search.fit(diabetes.data, diabetes.target)
|
||||
|
||||
|
||||
def test_pickle():
    """Check that fitted AdaBoost estimators survive a pickle round trip.

    Each estimator is fitted, scored, pickled and unpickled; the restored
    object must have exactly the same type and give exactly the same score
    on the training data.
    """
    import pickle

    cases = [
        # Adaboost classifier, one case per algorithm
        (AdaBoostClassifier(algorithm="SAMME"), iris),
        (AdaBoostClassifier(algorithm="SAMME.R"), iris),
        # Adaboost regressor
        (AdaBoostRegressor(random_state=0), diabetes),
    ]

    for estimator, dataset in cases:
        estimator.fit(dataset.data, dataset.target)
        score = estimator.score(dataset.data, dataset.target)

        restored = pickle.loads(pickle.dumps(estimator))

        # Exact type identity is intended here, hence `is` rather than `==`
        # (or the looser `isinstance`).
        assert type(restored) is type(estimator)
        assert restored.score(dataset.data, dataset.target) == score
|
||||
|
||||
|
||||
def test_importances():
    """The three informative features must dominate the importances.

    The data is generated with ``shuffle=False`` and three informative
    features out of ten; the test compares the first three importances
    against the remaining seven.
    """
    n_features = 10
    n_informative = 3
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )

    for alg in ("SAMME", "SAMME.R"):
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == n_features

        leading = importances[:n_informative]
        trailing = importances[n_informative:]
        # Every leading importance is compared against every trailing one.
        assert np.all(leading[:, np.newaxis] >= trailing)
|
||||
|
||||
|
||||
def test_error():
    """Deficient input must be rejected with the expected ``ValueError``."""
    # Unknown regression loss.
    loss_msg = "loss must be 'linear', 'square', or 'exponential'. Got 'foo' instead."
    bad_loss = AdaBoostRegressor(loss="foo")
    with pytest.raises(ValueError, match=loss_msg):
        bad_loss.fit(X, y_class)

    # Unknown boosting algorithm.
    algorithm_msg = "Algorithm must be 'SAMME' or 'SAMME.R'. Got 'foo' instead."
    bad_algorithm = AdaBoostClassifier(algorithm="foo")
    with pytest.raises(ValueError, match=algorithm_msg):
        bad_algorithm.fit(X, y_class)

    # Sample weights whose length does not match the data.
    shape_msg = re.escape("sample_weight.shape == (1,), expected (6,)")
    default_clf = AdaBoostClassifier()
    with pytest.raises(ValueError, match=shape_msg):
        default_clf.fit(X, y_class, sample_weight=np.asarray([-1]))
|
||||
|
||||
|
||||
def test_base_estimator():
    """Boosting must accept several kinds of base estimator."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    forest_clf = AdaBoostClassifier(RandomForestClassifier())
    forest_clf.fit(X, y_regr)

    svc_clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    svc_clf.fit(X, y_class)

    forest_reg = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    forest_reg.fit(X, y_regr)

    svr_reg = AdaBoostRegressor(SVR(), random_state=0)
    svr_reg.fit(X, y_regr)

    # An empty discrete ensemble has to fail in fit, not later in predict.
    # All samples are identical while every label is different.
    X_fail = [[1, 1]] * 4
    y_fail = ["foo", "bar", 1, 2]
    failing_clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    with pytest.raises(ValueError, match="worse than random"):
        failing_clf.fit(X_fail, y_fail)
|
||||
|
||||
|
||||
def test_sample_weights_infinite():
    """Fitting with a large learning rate must warn about infinite weights."""
    expected_warning = "Sample weights have reached infinite values"
    clf = AdaBoostClassifier(
        n_estimators=30,
        learning_rate=5.0,
        algorithm="SAMME",
    )
    with pytest.warns(UserWarning, match=expected_warning):
        clf.fit(iris.data, iris.target)
|
||||
|
||||
|
||||
def test_sparse_classification():
    """Sparse and dense training data must give the same classifier outputs.

    For each sparse container type, one ensemble is trained on the sparse
    data and one on the dense data; every prediction-style method is then
    compared between the two.
    """

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as usual and remember ``type(X)`` for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    def build_classifier():
        # Same configuration for the sparse-trained and dense-trained models.
        return AdaBoostClassifier(
            base_estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME",
        )

    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Methods called once, with the comparison used for their output.
    direct_checks = (
        ("predict", assert_array_equal),
        ("decision_function", assert_array_almost_equal),
        ("predict_log_proba", assert_array_almost_equal),
        ("predict_proba", assert_array_almost_equal),
    )
    # Methods yielding one output per stage.
    staged_checks = (
        ("staged_decision_function", assert_array_almost_equal),
        ("staged_predict", assert_array_equal),
        ("staged_predict_proba", assert_array_almost_equal),
    )

    for sparse_format in (csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix):
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = build_classifier().fit(X_train_sparse, y_train)
        # Trained on dense format
        dense_classifier = build_classifier().fit(X_train, y_train)

        for method, check in direct_checks:
            sparse_out = getattr(sparse_classifier, method)(X_test_sparse)
            dense_out = getattr(dense_classifier, method)(X_test)
            check(sparse_out, dense_out)

        # score
        sparse_score = sparse_classifier.score(X_test_sparse, y_test)
        dense_score = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_score, dense_score)

        for method, check in staged_checks:
            sparse_stages = getattr(sparse_classifier, method)(X_test_sparse)
            dense_stages = getattr(dense_classifier, method)(X_test)
            for sparse_res, dense_res in zip(sparse_stages, dense_stages):
                check(sparse_res, dense_res)

        # staged_score
        sparse_stages = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_stages = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_stages, dense_stages):
            assert_array_equal(sparse_res, dense_res)

        # Verify sparsity of data is maintained during training
        seen_types = [est.data_type_ for est in sparse_classifier.estimators_]
        assert all(t in (csc_matrix, csr_matrix) for t in seen_types)
|
||||
|
||||
|
||||
def test_sparse_regression():
    """Check that sparse and dense training data give the same regressor.

    For each sparse container type, one ensemble is trained on the sparse
    data and one on the dense data; ``predict`` and ``staged_predict`` are
    compared between the two, and the container type seen by each base
    estimator during the sparse fit is verified.
    """

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Fit as usual and remember ``type(X)`` for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(), random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format. The original also bound this estimator to
        # `dense_results`, a dead alias that was overwritten before any use.
        dense_regressor = AdaBoostRegressor(
            base_estimator=CustomSVR(), random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_regressor.predict(X_test_sparse)
        dense_results = dense_regressor.predict(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_regressor.staged_predict(X_test_sparse)
        dense_results = dense_regressor.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # Verify which container types the base estimators were fitted on.
        types = [est.data_type_ for est in sparse_regressor.estimators_]
        assert all(t in (csc_matrix, csr_matrix) for t in types)
|
||||
|
||||
|
||||
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor must accept a base estimator whose ``fit`` takes no
    ``sample_weight`` argument.

    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """

    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            """Do nothing; note that there is no ``sample_weight`` parameter."""

        def predict(self, X):
            n_rows = X.shape[0]
            return np.zeros(n_rows)

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)

    n_weights = len(boost.estimator_weights_)
    n_errors = len(boost.estimator_errors_)
    assert n_weights == n_errors
|
||||
|
||||
|
||||
def test_multidimensional_X():
    """
    The AdaBoost estimators must be able to fit and predict on a data array
    with more than two dimensions.
    """
    n_samples = 50
    rng = np.random.RandomState(0)

    # Drawn in this order from the same generator.
    X = rng.randn(n_samples, 3, 3)
    yc = rng.choice([0, 1], n_samples)
    yr = rng.randn(n_samples)

    classifier = AdaBoostClassifier(DummyClassifier(strategy="most_frequent"))
    classifier.fit(X, yc)
    classifier.predict(X)
    classifier.predict_proba(X)

    regressor = AdaBoostRegressor(DummyRegressor())
    regressor.fit(X, yr)
    regressor.predict(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboostclassifier_without_sample_weight(algorithm):
    """Fitting must fail when the base estimator is wrapped to hide sample_weight."""
    X, y = iris.data, iris.target
    base_estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(base_estimator=base_estimator, algorithm=algorithm)

    wrapper_name = base_estimator.__class__.__name__
    err_msg = f"{wrapper_name} doesn't support sample_weight"
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)
|
||||
|
||||
|
||||
def test_adaboostregressor_sample_weight():
    """A zero sample weight must act like removing the sample.

    Three single-estimator models are fitted on near-linear data with one
    outlier: with the outlier, without it, and with it but weighted zero.
    All are scored on the data without the outlier.
    """
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    noise = rng.rand(X.shape[0]) * 0.0001
    y = (0.8 * X + 0.2) + noise
    X = X.reshape(-1, 1)

    # Turn the last sample into an arbitrary outlier.
    X[-1] *= 10
    y[-1] = 10000

    X_clean, y_clean = X[:-1], y[:-1]

    # random_state=0 ensure that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        base_estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # 1) model containing the outlier
    regr_with_outlier.fit(X, y)
    # 2) model without the outlier
    regr_no_outlier.fit(X_clean, y_clean)
    # 3) model containing the outlier but with a null sample-weight
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X_clean, y_clean)
    score_no_outlier = regr_no_outlier.score(X_clean, y_clean)
    score_with_weight = regr_with_weight.score(X_clean, y_clean)

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        # n_estimators: below the minimum, then of the wrong type
        ({"n_estimators": -1}, ValueError, "n_estimators == -1, must be >= 1"),
        ({"n_estimators": 0}, ValueError, "n_estimators == 0, must be >= 1"),
        (
            {"n_estimators": 1.5},
            TypeError,
            "n_estimators must be an instance of int, not float",
        ),
        # learning_rate: must be strictly positive
        ({"learning_rate": -1}, ValueError, "learning_rate == -1, must be > 0."),
        ({"learning_rate": 0}, ValueError, "learning_rate == 0, must be > 0."),
    ],
)
@pytest.mark.parametrize(
    "model, X, y",
    [(AdaBoostClassifier, X, y_class), (AdaBoostRegressor, X, y_regr)],
)
def test_adaboost_params_validation(model, X, y, params, err_type, err_msg):
    """Invalid constructor parameters must be reported when ``fit`` is called."""
    estimator = model(**params)
    with pytest.raises(err_type, match=err_msg):
        estimator.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
    """``predict`` must agree with the arg-max of ``predict_proba``.

    Regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14084
    """
    digits_X, digits_y = datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        digits_X, digits_y, random_state=42
    )

    model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
    model.fit(X_train, y_train)

    most_probable = np.argmax(model.predict_proba(X_test), axis=1)
    assert_array_equal(most_probable, model.predict(X_test))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    """A negative entry in ``sample_weight`` must be rejected by ``fit``."""
    weights = np.ones_like(y)
    weights[-1] = -10

    expected_message = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=expected_message):
        model.fit(X, y, sample_weight=weights)
|
||||
Reference in New Issue
Block a user