first commit
This commit is contained in:
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,30 @@
|
||||
from .validation import check_random_state
|
||||
|
||||
|
||||
def _init_arpack_v0(size, random_state):
    """Build the starting vector handed to ARPACK iterative routines.

    The entries are sampled uniformly on [-1, 1]. This mirrors the
    initialization used by ARPACK itself; a different initialization can
    lead to convergence issues.

    Parameters
    ----------
    size : int
        Number of entries of the vector to create.

    random_state : int, RandomState instance or None
        Controls the pseudo random number generator used for the uniform
        draw. An int is used as the seed, a RandomState instance is used
        as the generator, and None selects the RandomState instance used
        by `np.random`.

    Returns
    -------
    v0 : ndarray of shape (size,)
        The sampled starting vector.
    """
    rng = check_random_state(random_state)
    return rng.uniform(-1, 1, size)
|
||||
@@ -0,0 +1,48 @@
|
||||
class Bunch(dict):
    """Dictionary whose entries are also reachable as attributes.

    A Bunch is a regular ``dict`` with one extra convenience: every key can
    be read and written either with item syntax, `bunch["value_key"]`, or
    with attribute syntax, `bunch.value_key`. It is sometimes used as the
    return value of functions and methods.

    Examples
    --------
    >>> from sklearn.utils import Bunch
    >>> b = Bunch(a=1, b=2)
    >>> b['b']
    2
    >>> b.b
    2
    >>> b.a = 3
    >>> b['a']
    3
    >>> b.c = 6
    >>> b['c']
    6
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs)

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a plain dictionary item.
        self[key] = value

    def __dir__(self):
        # Expose the keys so that they show up in `dir()` / tab completion.
        return self.keys()

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails: fall back to the
        # dictionary content and translate a missing key into the exception
        # type expected from attribute access.
        try:
            found = self[key]
        except KeyError:
            raise AttributeError(key)
        return found

    def __setstate__(self, state):
        # Deliberately a no-op. Bunch pickles written by scikit-learn 0.16.*
        # carry a non-empty __dict__. Restoring it in scikit-learn 0.17 would
        # make `bunch.key` read from __dict__ while assignments to `bunch.key`
        # go through __setattr__ and only update bunch['key'], so the two
        # views would drift apart. Ignoring the pickled state avoids that.
        # See https://github.com/scikit-learn/scikit-learn/issues/6196.
        return None
|
||||
Binary file not shown.
@@ -0,0 +1,41 @@
|
||||
from cython cimport floating
|
||||
|
||||
|
||||
# Layout flag taken as first argument by `_gemv`, `_ger` and `_gemm` below.
cpdef enum BLAS_Order:
    RowMajor  # C contiguous
    ColMajor  # Fortran contiguous


# Transposition flag taken by `_gemv` and `_gemm` below. The numeric values
# are the ASCII codes of the characters 'n' and 't'.
cpdef enum BLAS_Trans:
    NoTrans = 110  # corresponds to 'n'
    Trans = 116    # corresponds to 't'


# All prototypes below are declarations only: no body is visible here.
# `floating` is the fused type cimported from cython, and every routine is
# declared `nogil`.
# NOTE(review): argument names are omitted in these prototypes; they
# presumably follow the usual BLAS argument order -- confirm against the
# implementation file.

# BLAS Level 1 ################################################################
cdef floating _dot(int, floating*, int, floating*, int) nogil

cdef floating _asum(int, floating*, int) nogil

cdef void _axpy(int, floating, floating*, int, floating*, int) nogil

cdef floating _nrm2(int, floating*, int) nogil

cdef void _copy(int, floating*, int, floating*, int) nogil

cdef void _scal(int, floating, floating*, int) nogil

cdef void _rotg(floating*, floating*, floating*, floating*) nogil

cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil

# BLAS Level 2 ################################################################
cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int,
                floating*, int, floating, floating*, int) nogil

cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int,
               floating*, int) nogil

# BLAS Level 3 ################################################################
cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating,
                floating*, int, floating*, int, floating, floating*,
                int) nogil
|
||||
@@ -0,0 +1,366 @@
|
||||
from contextlib import suppress
|
||||
from collections import Counter
|
||||
from typing import NamedTuple
|
||||
|
||||
import numpy as np
|
||||
from . import is_scalar_nan
|
||||
|
||||
|
||||
def _unique(values, *, return_inverse=False, return_counts=False):
    """Find the unique values of an array, including arrays of python objects.

    Arrays with object dtype are handled by a pure python implementation;
    every other dtype is delegated to the numpy based implementation.

    Parameters
    ----------
    values : ndarray
        Array whose unique values are requested.

    return_inverse : bool, default=False
        If True, also return the indices that rebuild `values` from the
        unique values.

    return_counts : bool, default=False
        If True, also return how many times each unique value occurs in
        `values`.

    Returns
    -------
    unique : ndarray
        The sorted unique values.

    unique_inverse : ndarray
        The indices to reconstruct the original array from the unique array.
        Only provided if `return_inverse` is True.

    unique_counts : ndarray
        The number of occurrences of each unique value in the original
        array. Only provided if `return_counts` is True.
    """
    # Pick the implementation once, then forward the flags unchanged.
    finder = _unique_python if values.dtype == object else _unique_np
    return finder(
        values, return_inverse=return_inverse, return_counts=return_counts
    )
|
||||
|
||||
|
||||
def _unique_np(values, return_inverse=False, return_counts=False):
|
||||
"""Helper function to find unique values for numpy arrays that correctly
|
||||
accounts for nans. See `_unique` documentation for details."""
|
||||
uniques = np.unique(
|
||||
values, return_inverse=return_inverse, return_counts=return_counts
|
||||
)
|
||||
|
||||
inverse, counts = None, None
|
||||
|
||||
if return_counts:
|
||||
*uniques, counts = uniques
|
||||
|
||||
if return_inverse:
|
||||
*uniques, inverse = uniques
|
||||
|
||||
if return_counts or return_inverse:
|
||||
uniques = uniques[0]
|
||||
|
||||
# np.unique will have duplicate missing values at the end of `uniques`
|
||||
# here we clip the nans and remove it from uniques
|
||||
if uniques.size and is_scalar_nan(uniques[-1]):
|
||||
nan_idx = np.searchsorted(uniques, np.nan)
|
||||
uniques = uniques[: nan_idx + 1]
|
||||
if return_inverse:
|
||||
inverse[inverse > nan_idx] = nan_idx
|
||||
|
||||
if return_counts:
|
||||
counts[nan_idx] = np.sum(counts[nan_idx:])
|
||||
counts = counts[: nan_idx + 1]
|
||||
|
||||
ret = (uniques,)
|
||||
|
||||
if return_inverse:
|
||||
ret += (inverse,)
|
||||
|
||||
if return_counts:
|
||||
ret += (counts,)
|
||||
|
||||
return ret[0] if len(ret) == 1 else ret
|
||||
|
||||
|
||||
class MissingValues(NamedTuple):
    """Flags recording which missing-value markers are present."""

    # True when a nan marker is present.
    nan: bool
    # True when a None marker is present.
    none: bool

    def to_list(self):
        """Return the present markers as a list, with None ahead of nan."""
        present = [None] if self.none else []
        if self.nan:
            present.append(np.nan)
        return present
|
||||
|
||||
|
||||
def _extract_missing(values):
    """Split a set into its regular values and its missing-value markers.

    Parameters
    ----------
    values: set
        Set of values to extract missing from.

    Returns
    -------
    output: set
        The input set without None and nan entries. The input object itself
        is returned when it holds no missing value.

    missing_values: MissingValues
        Flags telling whether None and/or nan were found.
    """
    missing = {item for item in values if item is None or is_scalar_nan(item)}

    if not missing:
        return values, MissingValues(nan=False, none=False)

    has_none = None in missing
    # `missing` only ever holds None and nan objects. With None present, any
    # additional entry has to be float('nan') or np.nan; without None, the
    # non-empty set consists of nans only.
    has_nan = len(missing) > 1 if has_none else True

    return values - missing, MissingValues(nan=has_nan, none=has_none)
|
||||
|
||||
|
||||
class _nandict(dict):
|
||||
"""Dictionary with support for nans."""
|
||||
|
||||
def __init__(self, mapping):
|
||||
super().__init__(mapping)
|
||||
for key, value in mapping.items():
|
||||
if is_scalar_nan(key):
|
||||
self.nan_value = value
|
||||
break
|
||||
|
||||
def __missing__(self, key):
|
||||
if hasattr(self, "nan_value") and is_scalar_nan(key):
|
||||
return self.nan_value
|
||||
raise KeyError(key)
|
||||
|
||||
|
||||
def _map_to_integer(values, uniques):
    """Replace every entry of `values` by its position in `uniques`."""
    positions = {}
    for index, unique in enumerate(uniques):
        positions[unique] = index
    # The nan-aware dictionary lets nan entries of `values` find the nan slot.
    lookup = _nandict(positions)
    return np.array([lookup[value] for value in values])
|
||||
|
||||
|
||||
def _unique_python(values, *, return_inverse, return_counts):
    # Pure python counterpart of `_unique_np`, only used by `_unique`; see
    # the docstring of `_unique` for the contract.
    try:
        present, missing = _extract_missing(set(values))
        # Regular values are sorted; the missing markers always go last.
        ordered = sorted(present) + missing.to_list()
        uniques = np.array(ordered, dtype=values.dtype)
    except TypeError:
        # Typically raised when the values are of mixed, unorderable types.
        type_names = sorted(cls.__qualname__ for cls in {type(v) for v in values})
        raise TypeError(
            "Encoders require their input to be uniformly "
            f"strings or numbers. Got {type_names}"
        )

    if not (return_inverse or return_counts):
        return uniques

    outputs = [uniques]
    if return_inverse:
        outputs.append(_map_to_integer(values, uniques))
    if return_counts:
        outputs.append(_get_counts(values, uniques))
    return tuple(outputs)
|
||||
|
||||
|
||||
def _encode(values, *, uniques, check_unknown=True):
|
||||
"""Helper function to encode values into [0, n_uniques - 1].
|
||||
|
||||
Uses pure python method for object dtype, and numpy method for
|
||||
all other dtypes.
|
||||
The numpy method has the limitation that the `uniques` need to
|
||||
be sorted. Importantly, this is not checked but assumed to already be
|
||||
the case. The calling method needs to ensure this for all non-object
|
||||
values.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : ndarray
|
||||
Values to encode.
|
||||
uniques : ndarray
|
||||
The unique values in `values`. If the dtype is not object, then
|
||||
`uniques` needs to be sorted.
|
||||
check_unknown : bool, default=True
|
||||
If True, check for values in `values` that are not in `unique`
|
||||
and raise an error. This is ignored for object dtype, and treated as
|
||||
True in this case. This parameter is useful for
|
||||
_BaseEncoder._transform() to avoid calling _check_unknown()
|
||||
twice.
|
||||
|
||||
Returns
|
||||
-------
|
||||
encoded : ndarray
|
||||
Encoded values
|
||||
"""
|
||||
if values.dtype.kind in "OUS":
|
||||
try:
|
||||
return _map_to_integer(values, uniques)
|
||||
except KeyError as e:
|
||||
raise ValueError(f"y contains previously unseen labels: {str(e)}")
|
||||
else:
|
||||
if check_unknown:
|
||||
diff = _check_unknown(values, uniques)
|
||||
if diff:
|
||||
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
|
||||
return np.searchsorted(uniques, values)
|
||||
|
||||
|
||||
def _check_unknown(values, known_values, return_mask=False):
|
||||
"""
|
||||
Helper function to check for unknowns in values to be encoded.
|
||||
|
||||
Uses pure python method for object dtype, and numpy method for
|
||||
all other dtypes.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : array
|
||||
Values to check for unknowns.
|
||||
known_values : array
|
||||
Known values. Must be unique.
|
||||
return_mask : bool, default=False
|
||||
If True, return a mask of the same shape as `values` indicating
|
||||
the valid values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
diff : list
|
||||
The unique values present in `values` and not in `know_values`.
|
||||
valid_mask : boolean array
|
||||
Additionally returned if ``return_mask=True``.
|
||||
|
||||
"""
|
||||
valid_mask = None
|
||||
|
||||
if values.dtype.kind in "OUS":
|
||||
values_set = set(values)
|
||||
values_set, missing_in_values = _extract_missing(values_set)
|
||||
|
||||
uniques_set = set(known_values)
|
||||
uniques_set, missing_in_uniques = _extract_missing(uniques_set)
|
||||
diff = values_set - uniques_set
|
||||
|
||||
nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan
|
||||
none_in_diff = missing_in_values.none and not missing_in_uniques.none
|
||||
|
||||
def is_valid(value):
|
||||
return (
|
||||
value in uniques_set
|
||||
or missing_in_uniques.none
|
||||
and value is None
|
||||
or missing_in_uniques.nan
|
||||
and is_scalar_nan(value)
|
||||
)
|
||||
|
||||
if return_mask:
|
||||
if diff or nan_in_diff or none_in_diff:
|
||||
valid_mask = np.array([is_valid(value) for value in values])
|
||||
else:
|
||||
valid_mask = np.ones(len(values), dtype=bool)
|
||||
|
||||
diff = list(diff)
|
||||
if none_in_diff:
|
||||
diff.append(None)
|
||||
if nan_in_diff:
|
||||
diff.append(np.nan)
|
||||
else:
|
||||
unique_values = np.unique(values)
|
||||
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
|
||||
if return_mask:
|
||||
if diff.size:
|
||||
valid_mask = np.in1d(values, known_values)
|
||||
else:
|
||||
valid_mask = np.ones(len(values), dtype=bool)
|
||||
|
||||
# check for nans in the known_values
|
||||
if np.isnan(known_values).any():
|
||||
diff_is_nan = np.isnan(diff)
|
||||
if diff_is_nan.any():
|
||||
# removes nan from valid_mask
|
||||
if diff.size and return_mask:
|
||||
is_nan = np.isnan(values)
|
||||
valid_mask[is_nan] = 1
|
||||
|
||||
# remove nan from diff
|
||||
diff = diff[~diff_is_nan]
|
||||
diff = list(diff)
|
||||
|
||||
if return_mask:
|
||||
return diff, valid_mask
|
||||
return diff
|
||||
|
||||
|
||||
class _NaNCounter(Counter):
|
||||
"""Counter with support for nan values."""
|
||||
|
||||
def __init__(self, items):
|
||||
super().__init__(self._generate_items(items))
|
||||
|
||||
def _generate_items(self, items):
|
||||
"""Generate items without nans. Stores the nan counts separately."""
|
||||
for item in items:
|
||||
if not is_scalar_nan(item):
|
||||
yield item
|
||||
continue
|
||||
if not hasattr(self, "nan_count"):
|
||||
self.nan_count = 0
|
||||
self.nan_count += 1
|
||||
|
||||
def __missing__(self, key):
|
||||
if hasattr(self, "nan_count") and is_scalar_nan(key):
|
||||
return self.nan_count
|
||||
raise KeyError(key)
|
||||
|
||||
|
||||
def _get_counts(values, uniques):
    """Get the count of each of the `uniques` in `values`.

    The counts will use the order passed in by `uniques`. Entries of
    `uniques` that do not occur in `values` get a count of 0.

    Object, unicode and bytes dtypes are counted with a nan-aware python
    counter. For the remaining dtypes, `uniques` is assumed to be sorted
    with `np.nan` at the end.

    Parameters
    ----------
    values : ndarray
        Values to count.
    uniques : ndarray
        Values whose number of occurrences in `values` is requested.

    Returns
    -------
    output : ndarray of int64
        Counts aligned with `uniques`.
    """
    # "S" is included for consistency with `_encode` and `_check_unknown`:
    # bytes cannot go through the numeric path, where np.isnan rejects them.
    if values.dtype.kind in "OUS":
        counter = _NaNCounter(values)
        output = np.zeros(len(uniques), dtype=np.int64)
        for i, item in enumerate(uniques):
            # Items never seen in `values` keep their initial count of 0.
            with suppress(KeyError):
                output[i] = counter[item]
        return output

    unique_values, counts = _unique_np(values, return_counts=True)

    # Reorder unique_values based on input: `uniques`
    uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
    # nan never compares equal, so a trailing nan shared by both arrays has
    # to be flagged by hand. The size checks keep the `[-1]` lookups from
    # raising on empty input.
    if (
        unique_values.size
        and len(uniques)
        and np.isnan(unique_values[-1])
        and np.isnan(uniques[-1])
    ):
        uniques_in_values[-1] = True

    unique_valid_indices = np.searchsorted(unique_values, uniques[uniques_in_values])
    output = np.zeros_like(uniques, dtype=np.int64)
    output[uniques_in_values] = counts[unique_valid_indices]
    return output
|
||||
@@ -0,0 +1,420 @@
|
||||
from contextlib import closing
|
||||
from contextlib import suppress
|
||||
from io import StringIO
|
||||
from string import Template
|
||||
import html
|
||||
|
||||
from .. import config_context
|
||||
|
||||
|
||||
class _IDCounter:
|
||||
"""Generate sequential ids with a prefix."""
|
||||
|
||||
def __init__(self, prefix):
|
||||
self.prefix = prefix
|
||||
self.count = 0
|
||||
|
||||
def get_id(self):
|
||||
self.count += 1
|
||||
return f"{self.prefix}-{self.count}"
|
||||
|
||||
|
||||
_CONTAINER_ID_COUNTER = _IDCounter("sk-container-id")
|
||||
_ESTIMATOR_ID_COUNTER = _IDCounter("sk-estimator-id")
|
||||
|
||||
|
||||
class _VisualBlock:
|
||||
"""HTML Representation of Estimator
|
||||
|
||||
Parameters
|
||||
----------
|
||||
kind : {'serial', 'parallel', 'single'}
|
||||
kind of HTML block
|
||||
|
||||
estimators : list of estimators or `_VisualBlock`s or a single estimator
|
||||
If kind != 'single', then `estimators` is a list of
|
||||
estimators.
|
||||
If kind == 'single', then `estimators` is a single estimator.
|
||||
|
||||
names : list of str, default=None
|
||||
If kind != 'single', then `names` corresponds to estimators.
|
||||
If kind == 'single', then `names` is a single string corresponding to
|
||||
the single estimator.
|
||||
|
||||
name_details : list of str, str, or None, default=None
|
||||
If kind != 'single', then `name_details` corresponds to `names`.
|
||||
If kind == 'single', then `name_details` is a single string
|
||||
corresponding to the single estimator.
|
||||
|
||||
dash_wrapped : bool, default=True
|
||||
If true, wrapped HTML element will be wrapped with a dashed border.
|
||||
Only active when kind != 'single'.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, kind, estimators, *, names=None, name_details=None, dash_wrapped=True
|
||||
):
|
||||
self.kind = kind
|
||||
self.estimators = estimators
|
||||
self.dash_wrapped = dash_wrapped
|
||||
|
||||
if self.kind in ("parallel", "serial"):
|
||||
if names is None:
|
||||
names = (None,) * len(estimators)
|
||||
if name_details is None:
|
||||
name_details = (None,) * len(estimators)
|
||||
|
||||
self.names = names
|
||||
self.name_details = name_details
|
||||
|
||||
def _sk_visual_block_(self):
|
||||
return self
|
||||
|
||||
|
||||
def _write_label_html(
|
||||
out,
|
||||
name,
|
||||
name_details,
|
||||
outer_class="sk-label-container",
|
||||
inner_class="sk-label",
|
||||
checked=False,
|
||||
):
|
||||
"""Write labeled html with or without a dropdown with named details"""
|
||||
out.write(f'<div class="{outer_class}"><div class="{inner_class} sk-toggleable">')
|
||||
name = html.escape(name)
|
||||
|
||||
if name_details is not None:
|
||||
name_details = html.escape(str(name_details))
|
||||
label_class = "sk-toggleable__label sk-toggleable__label-arrow"
|
||||
|
||||
checked_str = "checked" if checked else ""
|
||||
est_id = _ESTIMATOR_ID_COUNTER.get_id()
|
||||
out.write(
|
||||
'<input class="sk-toggleable__control sk-hidden--visually" '
|
||||
f'id="{est_id}" type="checkbox" {checked_str}>'
|
||||
f'<label for="{est_id}" class="{label_class}">{name}</label>'
|
||||
f'<div class="sk-toggleable__content"><pre>{name_details}'
|
||||
"</pre></div>"
|
||||
)
|
||||
else:
|
||||
out.write(f"<label>{name}</label>")
|
||||
out.write("</div></div>") # outer_class inner_class
|
||||
|
||||
|
||||
def _get_visual_block(estimator):
|
||||
"""Generate information about how to display an estimator."""
|
||||
with suppress(AttributeError):
|
||||
return estimator._sk_visual_block_()
|
||||
|
||||
if isinstance(estimator, str):
|
||||
return _VisualBlock(
|
||||
"single", estimator, names=estimator, name_details=estimator
|
||||
)
|
||||
elif estimator is None:
|
||||
return _VisualBlock("single", estimator, names="None", name_details="None")
|
||||
|
||||
# check if estimator looks like a meta estimator wraps estimators
|
||||
if hasattr(estimator, "get_params"):
|
||||
estimators = [
|
||||
(key, est)
|
||||
for key, est in estimator.get_params(deep=False).items()
|
||||
if hasattr(est, "get_params") and hasattr(est, "fit")
|
||||
]
|
||||
if estimators:
|
||||
return _VisualBlock(
|
||||
"parallel",
|
||||
[est for _, est in estimators],
|
||||
names=[f"{key}: {est.__class__.__name__}" for key, est in estimators],
|
||||
name_details=[str(est) for _, est in estimators],
|
||||
)
|
||||
|
||||
return _VisualBlock(
|
||||
"single",
|
||||
estimator,
|
||||
names=estimator.__class__.__name__,
|
||||
name_details=str(estimator),
|
||||
)
|
||||
|
||||
|
||||
def _write_estimator_html(
    out, estimator, estimator_label, estimator_label_details, first_call=False
):
    """Write the HTML of an estimator laid out serially, in parallel or alone."""
    if first_call:
        est_block = _get_visual_block(estimator)
    else:
        # Nested estimators are described with only their changed parameters.
        with config_context(print_changed_only=True):
            est_block = _get_visual_block(estimator)

    kind = est_block.kind

    if kind == "single":
        _write_label_html(
            out,
            est_block.names,
            est_block.name_details,
            outer_class="sk-item",
            inner_class="sk-estimator",
            checked=first_call,
        )
        return

    if kind not in ("serial", "parallel"):
        # Unknown kinds produce no output.
        return

    # The outermost block always gets the dashed border.
    wrap_with_dashes = first_call or est_block.dash_wrapped
    dash_cls = " sk-dashed-wrapped" if wrap_with_dashes else ""
    out.write(f'<div class="sk-item{dash_cls}">')

    if estimator_label:
        _write_label_html(out, estimator_label, estimator_label_details)

    out.write(f'<div class="sk-{kind}">')

    children = zip(est_block.estimators, est_block.names, est_block.name_details)
    for child, child_name, child_details in children:
        if kind == "serial":
            _write_estimator_html(out, child, child_name, child_details)
            continue

        # parallel: each child sits in its own column, wrapped in a serial
        # visual block without a dashed border.
        out.write('<div class="sk-parallel-item">')
        column = _VisualBlock("serial", [child], dash_wrapped=False)
        _write_estimator_html(out, column, child_name, child_details)
        out.write("</div>")  # sk-parallel-item

    out.write("</div></div>")
|
||||
|
||||
|
||||
_STYLE = """
|
||||
#$id {
|
||||
color: black;
|
||||
background-color: white;
|
||||
}
|
||||
#$id pre{
|
||||
padding: 0;
|
||||
}
|
||||
#$id div.sk-toggleable {
|
||||
background-color: white;
|
||||
}
|
||||
#$id label.sk-toggleable__label {
|
||||
cursor: pointer;
|
||||
display: block;
|
||||
width: 100%;
|
||||
margin-bottom: 0;
|
||||
padding: 0.3em;
|
||||
box-sizing: border-box;
|
||||
text-align: center;
|
||||
}
|
||||
#$id label.sk-toggleable__label-arrow:before {
|
||||
content: "▸";
|
||||
float: left;
|
||||
margin-right: 0.25em;
|
||||
color: #696969;
|
||||
}
|
||||
#$id label.sk-toggleable__label-arrow:hover:before {
|
||||
color: black;
|
||||
}
|
||||
#$id div.sk-estimator:hover label.sk-toggleable__label-arrow:before {
|
||||
color: black;
|
||||
}
|
||||
#$id div.sk-toggleable__content {
|
||||
max-height: 0;
|
||||
max-width: 0;
|
||||
overflow: hidden;
|
||||
text-align: left;
|
||||
background-color: #f0f8ff;
|
||||
}
|
||||
#$id div.sk-toggleable__content pre {
|
||||
margin: 0.2em;
|
||||
color: black;
|
||||
border-radius: 0.25em;
|
||||
background-color: #f0f8ff;
|
||||
}
|
||||
#$id input.sk-toggleable__control:checked~div.sk-toggleable__content {
|
||||
max-height: 200px;
|
||||
max-width: 100%;
|
||||
overflow: auto;
|
||||
}
|
||||
#$id input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {
|
||||
content: "▾";
|
||||
}
|
||||
#$id div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {
|
||||
background-color: #d4ebff;
|
||||
}
|
||||
#$id div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {
|
||||
background-color: #d4ebff;
|
||||
}
|
||||
#$id input.sk-hidden--visually {
|
||||
border: 0;
|
||||
clip: rect(1px 1px 1px 1px);
|
||||
clip: rect(1px, 1px, 1px, 1px);
|
||||
height: 1px;
|
||||
margin: -1px;
|
||||
overflow: hidden;
|
||||
padding: 0;
|
||||
position: absolute;
|
||||
width: 1px;
|
||||
}
|
||||
#$id div.sk-estimator {
|
||||
font-family: monospace;
|
||||
background-color: #f0f8ff;
|
||||
border: 1px dotted black;
|
||||
border-radius: 0.25em;
|
||||
box-sizing: border-box;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
#$id div.sk-estimator:hover {
|
||||
background-color: #d4ebff;
|
||||
}
|
||||
#$id div.sk-parallel-item::after {
|
||||
content: "";
|
||||
width: 100%;
|
||||
border-bottom: 1px solid gray;
|
||||
flex-grow: 1;
|
||||
}
|
||||
#$id div.sk-label:hover label.sk-toggleable__label {
|
||||
background-color: #d4ebff;
|
||||
}
|
||||
#$id div.sk-serial::before {
|
||||
content: "";
|
||||
position: absolute;
|
||||
border-left: 1px solid gray;
|
||||
box-sizing: border-box;
|
||||
top: 0;
|
||||
bottom: 0;
|
||||
left: 50%;
|
||||
z-index: 0;
|
||||
}
|
||||
#$id div.sk-serial {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
background-color: white;
|
||||
padding-right: 0.2em;
|
||||
padding-left: 0.2em;
|
||||
position: relative;
|
||||
}
|
||||
#$id div.sk-item {
|
||||
position: relative;
|
||||
z-index: 1;
|
||||
}
|
||||
#$id div.sk-parallel {
|
||||
display: flex;
|
||||
align-items: stretch;
|
||||
justify-content: center;
|
||||
background-color: white;
|
||||
position: relative;
|
||||
}
|
||||
#$id div.sk-item::before, #$id div.sk-parallel-item::before {
|
||||
content: "";
|
||||
position: absolute;
|
||||
border-left: 1px solid gray;
|
||||
box-sizing: border-box;
|
||||
top: 0;
|
||||
bottom: 0;
|
||||
left: 50%;
|
||||
z-index: -1;
|
||||
}
|
||||
#$id div.sk-parallel-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
z-index: 1;
|
||||
position: relative;
|
||||
background-color: white;
|
||||
}
|
||||
#$id div.sk-parallel-item:first-child::after {
|
||||
align-self: flex-end;
|
||||
width: 50%;
|
||||
}
|
||||
#$id div.sk-parallel-item:last-child::after {
|
||||
align-self: flex-start;
|
||||
width: 50%;
|
||||
}
|
||||
#$id div.sk-parallel-item:only-child::after {
|
||||
width: 0;
|
||||
}
|
||||
#$id div.sk-dashed-wrapped {
|
||||
border: 1px dashed gray;
|
||||
margin: 0 0.4em 0.5em 0.4em;
|
||||
box-sizing: border-box;
|
||||
padding-bottom: 0.4em;
|
||||
background-color: white;
|
||||
}
|
||||
#$id div.sk-label label {
|
||||
font-family: monospace;
|
||||
font-weight: bold;
|
||||
display: inline-block;
|
||||
line-height: 1.2em;
|
||||
}
|
||||
#$id div.sk-label-container {
|
||||
text-align: center;
|
||||
}
|
||||
#$id div.sk-container {
|
||||
/* jupyter's `normalize.less` sets `[hidden] { display: none; }`
|
||||
but bootstrap.min.css set `[hidden] { display: none !important; }`
|
||||
so we also need the `!important` here to be able to override the
|
||||
default hidden behavior on the sphinx rendered scikit-learn.org.
|
||||
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */
|
||||
display: inline-block !important;
|
||||
position: relative;
|
||||
}
|
||||
#$id div.sk-text-repr-fallback {
|
||||
display: none;
|
||||
}
|
||||
""".replace(
|
||||
" ", ""
|
||||
).replace(
|
||||
"\n", ""
|
||||
) # noqa
|
||||
|
||||
|
||||
def estimator_html_repr(estimator):
    """Build a HTML representation of an estimator.

    Read more in the :ref:`User Guide <visualizing_composite_estimators>`.

    Parameters
    ----------
    estimator : estimator object
        The estimator to visualize.

    Returns
    -------
    html: str
        HTML representation of estimator.
    """
    with closing(StringIO()) as out:
        container_id = _CONTAINER_ID_COUNTER.get_id()
        scoped_css = Template(_STYLE).substitute(id=container_id)
        estimator_str = str(estimator)

        # Two elements are written, and the CSS decides which one is seen:
        # - div.sk-text-repr-fallback is visible by default and the CSS
        #   hides it (display: none);
        # - div.sk-container carries the `hidden` attribute and the CSS
        #   displays it.
        # When the CSS is loaded (trusted notebook) the HTML representation
        # is shown; when it is not loaded (untrusted notebook) the fallback
        # text and message remain.
        fallback_msg = (
            "In a Jupyter environment, please rerun this cell to show the HTML"
            " representation or trust the notebook. <br />On GitHub, the"
            " HTML representation is unable to render, please try loading this page"
            " with nbviewer.org."
        )
        header = "".join(
            [
                f"<style>{scoped_css}</style>",
                f'<div id="{container_id}" class="sk-top-container">',
                '<div class="sk-text-repr-fallback">',
                f"<pre>{html.escape(estimator_str)}</pre><b>{fallback_msg}</b>",
                "</div>",
                '<div class="sk-container" hidden>',
            ]
        )
        out.write(header)
        _write_estimator_html(
            out,
            estimator,
            estimator.__class__.__name__,
            estimator_str,
            first_call=True,
        )
        out.write("</div></div>")  # sk-container, then sk-top-container

        return out.getvalue()
|
||||
Binary file not shown.
@@ -0,0 +1,22 @@
|
||||
# Author: Gael Varoquaux
|
||||
# License: BSD
|
||||
"""
|
||||
Uses C++ map containers for fast dict-like behavior with keys being
|
||||
integers, and values float.
|
||||
"""
|
||||
|
||||
from libcpp.map cimport map as cpp_map
|
||||
|
||||
# Import the C-level symbols of numpy
|
||||
cimport numpy as np
|
||||
|
||||
ctypedef np.float64_t DTYPE_t
|
||||
|
||||
ctypedef np.intp_t ITYPE_t
|
||||
|
||||
###############################################################################
|
||||
# An object to be used in Python
|
||||
|
||||
# Declaration of the extension type backed by a C++ map with ITYPE_t keys
# and DTYPE_t values (np.intp_t and np.float64_t per the ctypedefs of this
# file).
cdef class IntFloatDict:
    # The underlying C++ map holding the key/value pairs.
    cdef cpp_map[ITYPE_t, DTYPE_t] my_map
    # C-level method receiving typed memoryviews for keys and values.
    # NOTE(review): presumably fills them from `my_map`; the body is not
    # visible from this declaration -- confirm in the implementation file.
    cdef _to_arrays(self, ITYPE_t [:] keys, DTYPE_t [:] values)
|
||||
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
# Heap routines, used in various Cython implementations.
|
||||
|
||||
from cython cimport floating
|
||||
|
||||
from ._typedefs cimport ITYPE_t
|
||||
|
||||
|
||||
# Declaration of the heap push routine shared with other Cython modules.
# It takes a `floating` buffer of values and an ITYPE_t buffer of indices,
# both of length `size` presumably (TODO confirm), plus the value/index pair
# to push. It is declared `nogil`.
# NOTE(review): the meaning of the int return value is not visible from this
# declaration -- check the implementation.
cdef int heap_push(
    floating* values,
    ITYPE_t* indices,
    ITYPE_t size,
    floating val,
    ITYPE_t val_idx,
) nogil
|
||||
@@ -0,0 +1,31 @@
|
||||
import warnings as _warnings
|
||||
|
||||
# Import joblib with every warning silenced for the duration of the import;
# the filter is restored when the `with` block exits.
with _warnings.catch_warnings():
    _warnings.simplefilter("ignore")
    # joblib imports may raise DeprecationWarning on certain Python
    # versions
    import joblib
    from joblib import logger
    from joblib import dump, load
    from joblib import __version__
    from joblib import effective_n_jobs
    from joblib import hash
    from joblib import cpu_count, Parallel, Memory, delayed
    from joblib import parallel_backend, register_parallel_backend


# Names re-exported by this module: the joblib package itself plus the
# helpers imported above.
__all__ = [
    "parallel_backend",
    "register_parallel_backend",
    "cpu_count",
    "Parallel",
    "Memory",
    "delayed",
    "effective_n_jobs",
    "hash",
    "logger",
    "dump",
    "load",
    "joblib",
    "__version__",
]
|
||||
Binary file not shown.
@@ -0,0 +1,62 @@
|
||||
import numpy as np
|
||||
from scipy import sparse as sp
|
||||
from contextlib import suppress
|
||||
|
||||
from . import is_scalar_nan
|
||||
from .fixes import _object_dtype_isnan
|
||||
|
||||
|
||||
def _get_dense_mask(X, value_to_mask):
    """Return the mask of entries of a dense ``X`` matching ``value_to_mask``.

    Three cases are handled, in this order:

    - ``value_to_mask`` is ``pandas.NA`` (only when pandas is importable and
      exposes ``NA``): the result of ``pandas.isna(X)`` is returned;
    - ``value_to_mask`` is a scalar NaN: float arrays go through
      ``np.isnan``, integer arrays yield an all-False mask, and any other
      dtype is delegated to ``_object_dtype_isnan``;
    - anything else: the result of ``X == value_to_mask`` is returned.
    """
    with suppress(ImportError, AttributeError):
        # `AttributeError` is suppressed as well because older versions of
        # pandas do not have `NA`.
        import pandas

        if value_to_mask is pandas.NA:
            return pandas.isna(X)

    if not is_scalar_nan(value_to_mask):
        return X == value_to_mask

    dtype_kind = X.dtype.kind
    if dtype_kind == "f":
        return np.isnan(X)
    if dtype_kind in ("i", "u"):
        # An integer array cannot contain NaN.
        return np.zeros(X.shape, dtype=bool)
    # np.isnan does not work on object dtypes.
    return _object_dtype_isnan(X)
|
||||
|
||||
|
||||
def _get_mask(X, value_to_mask):
    """Compute the boolean mask X == value_to_mask.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Input data, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    value_to_mask : {int, float}
        The value which is to be masked in X.

    Returns
    -------
    X_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Missing mask. For sparse input the mask is computed on the stored
        values only and returned as a CSR matrix when ``X.format == "csr"``,
        otherwise as a CSC matrix.
    """
    if sp.issparse(X):
        # Only the explicitly stored entries are compared; the sparsity
        # structure (indices / indptr) is copied over unchanged.
        stored_mask = _get_dense_mask(X.data, value_to_mask)
        if X.format == "csr":
            build_sparse = sp.csr_matrix
        else:
            build_sparse = sp.csc_matrix
        structure = (stored_mask, X.indices.copy(), X.indptr.copy())
        return build_sparse(structure, shape=X.shape, dtype=bool)

    # Dense input: no sparse output has to be reconstructed.
    return _get_dense_mask(X, value_to_mask)
|
||||
@@ -0,0 +1,346 @@
|
||||
import numpy as np
|
||||
|
||||
from ..base import BaseEstimator, ClassifierMixin
|
||||
from .validation import _check_sample_weight, _num_samples, check_array
|
||||
from .validation import check_is_fitted
|
||||
|
||||
|
||||
class ArraySlicingWrapper:
    """Indexing proxy that wraps the result of ``array[...]`` in a MockDataFrame.

    Parameters
    ----------
    array : object supporting ``[]`` indexing
        Stored unchanged on ``self.array``.
    """

    def __init__(self, array):
        self.array = array

    def __getitem__(self, aslice):
        # Index the wrapped array first, then hand the selection back as a
        # MockDataFrame.
        selection = self.array[aslice]
        return MockDataFrame(selection)
|
||||
|
||||
|
||||
class MockDataFrame:
    """Minimal stand-in for a data frame, backed by a single array.

    Instances have a shape and a length but do not support direct indexing;
    positional access goes through ``iloc``.

    Parameters
    ----------
    array : array
        Wrapped data. It must expose ``shape``, ``ndim`` and ``take`` and
        support ``len()``. It is stored on both ``array`` and ``values``.
    """

    def __init__(self, array):
        self.array = array
        self.values = array
        self.shape = array.shape
        self.ndim = array.ndim
        # ugly hack to make iloc work.
        self.iloc = ArraySlicingWrapper(array)

    def __len__(self):
        return len(self.array)

    def __array__(self, dtype=None):
        # Pandas data frames also are array-like: we want to make sure that
        # input validation in cross-validation does not try to call that
        # method.
        return self.array

    def __eq__(self, other):
        # Element-wise comparison, wrapped again in a MockDataFrame.
        return MockDataFrame(self.array == other.array)

    def __ne__(self, other):
        # The previous `not self == other` took the truthiness of the
        # MockDataFrame built by __eq__, i.e. its length, so two non-empty
        # frames never compared as different. Compare the wrapped arrays
        # directly and keep returning a plain bool.
        return not np.array_equal(self.array, other.array)

    def take(self, indices, axis=0):
        """Return a new MockDataFrame built from ``array.take(indices, axis)``."""
        return MockDataFrame(self.array.take(indices, axis=axis))
|
||||
|
||||
|
||||
class CheckingClassifier(ClassifierMixin, BaseEstimator):
    """Dummy classifier to test pipelining and meta-estimators.

    Checks some property of `X` and `y` in fit / predict.
    This allows testing whether pipelines / cross-validation or metaestimators
    changed the input.

    Can also be used to check if `fit_params` are passed correctly, and
    to force a certain score to be returned.

    Parameters
    ----------
    check_y, check_X : callable, default=None
        The callables used to validate `X` and `y`. When a callable returns
        a bool, `False` triggers an `AssertionError`; any other return value
        replaces the checked input.

    check_y_params, check_X_params : dict, default=None
        The optional parameters to pass to `check_X` and `check_y`.

    methods_to_check : "all" or list of str, default="all"
        The methods in which the checks should be applied. By default,
        all checks will be done on all methods (`fit`, `predict`,
        `predict_proba`, `decision_function` and `score`).

    foo_param : int, default=0
        A `foo` param. When `foo_param > 1`, the output of :meth:`score`
        will be 1 otherwise it is 0.

    expected_sample_weight : bool, default=None
        Whether to check if a valid `sample_weight` was passed to `fit`.

    expected_fit_params : list of str, default=None
        A list of the expected parameters given when calling `fit`.

    Attributes
    ----------
    classes_ : ndarray
        The unique classes seen during `fit`.

    n_features_in_ : int
        The number of features seen during `fit`.

    Examples
    --------
    >>> from sklearn.utils._mocking import CheckingClassifier

    This helper allows asserting specific properties of `X` or `y`. In this
    case we expect `check_X` or `check_y` to return a boolean.

    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = CheckingClassifier(check_X=lambda x: x.shape == (150, 4))
    >>> clf.fit(X, y)
    CheckingClassifier(...)

    We can also provide a check which might raise an error. In this case, we
    expect `check_X` to return `X` and `check_y` to return `y`.

    >>> from sklearn.utils import check_array
    >>> clf = CheckingClassifier(check_X=check_array)
    >>> clf.fit(X, y)
    CheckingClassifier(...)
    """

    def __init__(
        self,
        *,
        check_y=None,
        check_y_params=None,
        check_X=None,
        check_X_params=None,
        methods_to_check="all",
        foo_param=0,
        expected_sample_weight=None,
        expected_fit_params=None,
    ):
        self.check_y = check_y
        self.check_y_params = check_y_params
        self.check_X = check_X
        self.check_X_params = check_X_params
        self.methods_to_check = methods_to_check
        self.foo_param = foo_param
        self.expected_sample_weight = expected_sample_weight
        self.expected_fit_params = expected_fit_params

    def _is_method_checked(self, method_name):
        """Tell whether the input checks apply to the method `method_name`."""
        return self.methods_to_check == "all" or method_name in self.methods_to_check

    @staticmethod
    def _run_check(check, check_params, data):
        """Apply `check` to `data` and return the data to use afterwards.

        A boolean result is asserted and `data` is returned untouched; any
        other result replaces `data`.
        """
        kwargs = {} if check_params is None else check_params
        outcome = check(data, **kwargs)
        if isinstance(outcome, (bool, np.bool_)):
            assert outcome
            return data
        return outcome

    def _check_X_y(self, X, y=None, should_be_fitted=True):
        """Validate X and y and make extra check.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data set.
        y : array-like of shape (n_samples), default=None
            The corresponding target, by default None.
        should_be_fitted : bool, default=True
            Whether or not the classifier should be already fitted.
            By default True.

        Returns
        -------
        X, y
        """
        if should_be_fitted:
            check_is_fitted(self)
        if self.check_X is not None:
            X = self._run_check(self.check_X, self.check_X_params, X)
        if y is not None and self.check_y is not None:
            y = self._run_check(self.check_y, self.check_y_params, y)
        return X, y

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Fit classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples, n_outputs) or (n_samples,), \
                default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self
        """
        assert _num_samples(X) == _num_samples(y)
        if self._is_method_checked("fit"):
            X, y = self._check_X_y(X, y, should_be_fitted=False)

        self.n_features_in_ = np.shape(X)[1]
        self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True))

        if self.expected_fit_params:
            missing = set(self.expected_fit_params) - set(fit_params)
            if missing:
                raise AssertionError(
                    f"Expected fit parameter(s) {list(missing)} not seen."
                )
            # Every fit parameter received must be sample-aligned with X.
            for key, value in fit_params.items():
                n_value, n_expected = _num_samples(value), _num_samples(X)
                if n_value != n_expected:
                    raise AssertionError(
                        f"Fit parameter {key} has length {n_value}"
                        f"; expected {n_expected}."
                    )

        if self.expected_sample_weight:
            if sample_weight is None:
                raise AssertionError("Expected sample_weight to be passed")
            _check_sample_weight(sample_weight, X)

        return self

    def predict(self, X):
        """Predict the first class seen in `classes_`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        preds : ndarray of shape (n_samples,)
            Predictions of the first class seen in `classes_`.
        """
        if self._is_method_checked("predict"):
            X, y = self._check_X_y(X)
        first_class_index = np.zeros(_num_samples(X), dtype=int)
        return self.classes_[first_class_index]

    def predict_proba(self, X):
        """Predict probabilities for each class.

        Here, the dummy classifier will provide a probability of 1 for the
        first class of `classes_` and 0 otherwise.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            The probabilities for each sample and class.
        """
        if self._is_method_checked("predict_proba"):
            X, y = self._check_X_y(X)
        proba = np.zeros((_num_samples(X), len(self.classes_)))
        proba[:, 0] = 1
        return proba

    def decision_function(self, X):
        """Confidence score.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        decision : ndarray of shape (n_samples,) if n_classes == 2\
                else (n_samples, n_classes)
            Confidence score.
        """
        if self._is_method_checked("decision_function"):
            X, y = self._check_X_y(X)

        n_samples = _num_samples(X)
        if len(self.classes_) == 2:
            # for binary classifier, the confidence score is related to
            # classes_[1] and therefore should be null.
            return np.zeros(n_samples)

        decision = np.zeros((n_samples, len(self.classes_)))
        decision[:, 0] = 1
        return decision

    def score(self, X=None, Y=None):
        """Fake score.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Y : array-like of shape (n_samples, n_output) or (n_samples,)
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
            Either 0 or 1 depending of `foo_param` (i.e. `foo_param > 1 =>
            score=1` otherwise `score=0`).
        """
        if self._is_method_checked("score"):
            self._check_X_y(X, Y)
        return 1.0 if self.foo_param > 1 else 0.0

    def _more_tags(self):
        return {"_skip_test": True, "X_types": ["1dlabel"]}
|
||||
|
||||
|
||||
class NoSampleWeightWrapper(BaseEstimator):
    """Wrap estimator which will not expose `sample_weight`.

    `fit`, `predict` and `predict_proba` forward their arguments to the
    wrapped estimator and return whatever it returns; `fit` accepts only
    `X` and `y`.

    Parameters
    ----------
    est : estimator, default=None
        The estimator to wrap.
    """

    def __init__(self, est=None):
        self.est = est

    def fit(self, X, y):
        wrapped = self.est
        return wrapped.fit(X, y)

    def predict(self, X):
        wrapped = self.est
        return wrapped.predict(X)

    def predict_proba(self, X):
        wrapped = self.est
        return wrapped.predict_proba(X)

    def _more_tags(self):
        return {"_skip_test": True}
|
||||
Binary file not shown.
@@ -0,0 +1,6 @@
|
||||
# Helpers to access OpenMP threads information
|
||||
#
|
||||
# These interfaces act as indirections that allow implementations written
# for OpenMP to be used even when OpenMP is not supported.
|
||||
|
||||
# Declaration only: takes no argument, returns a C int, and is `nogil`, so it
# can be called without holding the GIL.
# NOTE(review): presumably returns the calling thread's OpenMP thread number;
# the body is not visible here, confirm against the implementation file.
cdef int _openmp_thread_num() nogil
|
||||
@@ -0,0 +1,463 @@
|
||||
"""This module contains the _EstimatorPrettyPrinter class used in
|
||||
BaseEstimator.__repr__ for pretty-printing estimators"""
|
||||
|
||||
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
|
||||
# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 Python Software Foundation;
|
||||
# All Rights Reserved
|
||||
|
||||
# Authors: Fred L. Drake, Jr. <fdrake@acm.org> (built-in CPython pprint module)
|
||||
# Nicolas Hug (scikit-learn specific changes)
|
||||
|
||||
# License: PSF License version 2 (see below)
|
||||
|
||||
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
|
||||
# --------------------------------------------
|
||||
|
||||
# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"),
|
||||
# and the Individual or Organization ("Licensee") accessing and otherwise
|
||||
# using this software ("Python") in source or binary form and its associated
|
||||
# documentation.
|
||||
|
||||
# 2. Subject to the terms and conditions of this License Agreement, PSF hereby
|
||||
# grants Licensee a nonexclusive, royalty-free, world-wide license to
|
||||
# reproduce, analyze, test, perform and/or display publicly, prepare
|
||||
# derivative works, distribute, and otherwise use Python alone or in any
|
||||
# derivative version, provided, however, that PSF's License Agreement and
|
||||
# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004,
|
||||
# 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
|
||||
# 2017, 2018 Python Software Foundation; All Rights Reserved" are retained in
|
||||
# Python alone or in any derivative version prepared by Licensee.
|
||||
|
||||
# 3. In the event Licensee prepares a derivative work that is based on or
|
||||
# incorporates Python or any part thereof, and wants to make the derivative
|
||||
# work available to others as provided herein, then Licensee hereby agrees to
|
||||
# include in any such work a brief summary of the changes made to Python.
|
||||
|
||||
# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES
|
||||
# NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT
|
||||
# NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF
|
||||
# MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
|
||||
# PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
|
||||
|
||||
# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY
|
||||
# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
|
||||
# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE
|
||||
# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
|
||||
|
||||
# 6. This License Agreement will automatically terminate upon a material
|
||||
# breach of its terms and conditions.
|
||||
|
||||
# 7. Nothing in this License Agreement shall be deemed to create any
|
||||
# relationship of agency, partnership, or joint venture between PSF and
|
||||
# Licensee. This License Agreement does not grant permission to use PSF
|
||||
# trademarks or trade name in a trademark sense to endorse or promote products
|
||||
# or services of Licensee, or any third party.
|
||||
|
||||
# 8. By copying, installing or otherwise using Python, Licensee agrees to be
|
||||
# bound by the terms and conditions of this License Agreement.
|
||||
|
||||
|
||||
# Brief summary of changes to original code:
|
||||
# - "compact" parameter is supported for dicts, not just lists or tuples
|
||||
# - estimators have a custom handler, they're not just treated as objects
|
||||
# - long sequences (lists, tuples, dict items) with more than N elements are
|
||||
# shortened using ellipsis (', ...') at the end.
|
||||
|
||||
import inspect
|
||||
import pprint
|
||||
from collections import OrderedDict
|
||||
|
||||
from ..base import BaseEstimator
|
||||
from .._config import get_config
|
||||
from . import is_scalar_nan
|
||||
|
||||
|
||||
class KeyValTuple(tuple):
    """Tuple subclass used to render key-value pairs coming from dicts."""

    def __repr__(self):
        # A distinct __repr__ function object is needed so that
        # _dispatch[tuple.__repr__] is not overridden; the output itself is
        # the plain tuple representation.
        return tuple.__repr__(self)
|
||||
|
||||
|
||||
class KeyValTupleParam(KeyValTuple):
    """Marker subclass used to render key-value pairs coming from parameters."""
|
||||
|
||||
|
||||
def _changed_params(estimator):
    """Return dict (param_name: value) of parameters that were given to
    estimator with non-default values.

    Parameters
    ----------
    estimator : object
        Must provide ``get_params(deep=False)`` and an ``__init__`` whose
        signature can be inspected. If ``__init__`` carries a
        ``deprecated_original`` attribute, that callable is inspected instead.

    Returns
    -------
    changed : dict
        The subset of ``estimator.get_params(deep=False)`` whose values are
        considered different from the defaults of the constructor.
    """
    params = estimator.get_params(deep=False)
    init_func = getattr(estimator.__init__, "deprecated_original", estimator.__init__)
    defaults = {
        name: param.default
        for name, param in inspect.signature(init_func).parameters.items()
    }

    def has_changed(k, v):
        if k not in defaults:  # happens if k is part of a **kwargs
            return True
        default = defaults[k]
        # Identity test against the public sentinel: using `==` would call
        # the default's own __eq__, which for array-like defaults returns an
        # array and makes the `if` raise.
        if default is inspect.Parameter.empty:  # k has no default value
            return True
        # try to avoid calling repr on nested estimators
        if isinstance(v, BaseEstimator) and v.__class__ != default.__class__:
            return True
        # Use repr as a last resort. It may be expensive.
        if repr(v) != repr(default) and not (
            is_scalar_nan(default) and is_scalar_nan(v)
        ):
            return True
        return False

    return {k: v for k, v in params.items() if has_changed(k, v)}
|
||||
|
||||
|
||||
class _EstimatorPrettyPrinter(pprint.PrettyPrinter):
    """Pretty Printer class for estimator objects.

    This extends the pprint.PrettyPrinter class, because:
    - we need estimators to be printed with their parameters, e.g.
      Estimator(param1=value1, ...) which is not supported by default.
    - the 'compact' parameter of PrettyPrinter is ignored for dicts, which
      may lead to very long representations that we want to avoid.

    Quick overview of pprint.PrettyPrinter (see also
    https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):

    - the entry point is the _format() method which calls format() (overridden
      here)
    - format() directly calls _safe_repr() for a first try at rendering the
      object
    - _safe_repr formats the whole object recursively, only calling itself,
      not caring about line length or anything
    - back to _format(), if the output string is too long, _format() then calls
      the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on
      the type of the object. This is where the line length and the compact
      parameters are taken into account.
    - those _pprint_TYPE() methods will internally use the format() method for
      rendering the nested objects of an object (e.g. the elements of a list)

    In the end, everything has to be implemented twice: in _safe_repr and in
    the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not
    straightforward to extend (especially when we want a compact output), so
    the code is a bit convoluted.

    This class overrides:
    - format() to support the changed_only parameter
    - _safe_repr to support printing of estimators (for when they fit on a
      single line)
    - _format_dict_items so that dict are correctly 'compacted'
    - _format_items so that ellipsis is used on long lists and tuples

    When estimators cannot be printed on a single line, the builtin _format()
    will call _pprint_estimator() because it was registered to do so (see
    _dispatch[BaseEstimator.__repr__] = _pprint_estimator).

    Both _format_dict_items() and _pprint_estimator() use the
    _format_params_or_dict_items() method that will format parameters and
    key-value pairs respecting the compact parameter. This method needs another
    subroutine _pprint_key_val_tuple() used when a parameter or a key-value
    pair is too long to fit on a single line. This subroutine is called in
    _format() and is registered as well in the _dispatch dict (just like
    _pprint_estimator). We had to create the two classes KeyValTuple and
    KeyValTupleParam for this.
    """

    def __init__(
        self,
        indent=1,
        width=80,
        depth=None,
        stream=None,
        *,
        compact=False,
        indent_at_name=True,
        n_max_elements_to_show=None,
    ):
        # indent, width, depth, stream and compact are forwarded to
        # pprint.PrettyPrinter unchanged.
        super().__init__(indent, width, depth, stream, compact=compact)
        self._indent_at_name = indent_at_name
        if self._indent_at_name:
            self._indent_per_level = 1  # ignore indent param
        # The print_changed_only setting is read once, at construction time.
        self._changed_only = get_config()["print_changed_only"]
        # Max number of elements in a list, dict, tuple until we start using
        # ellipsis. This also affects the number of arguments of an estimators
        # (they are treated as dicts)
        self.n_max_elements_to_show = n_max_elements_to_show

    def format(self, object, context, maxlevels, level):
        """Delegate to the module-level _safe_repr with the stored changed_only flag."""
        return _safe_repr(
            object, context, maxlevels, level, changed_only=self._changed_only
        )

    def _pprint_estimator(self, object, stream, indent, allowance, context, level):
        """Write ``ClassName(param=value, ...)`` to `stream`, parameters sorted by name."""
        stream.write(object.__class__.__name__ + "(")
        if self._indent_at_name:
            # Continuation lines are shifted by the length of the class name.
            indent += len(object.__class__.__name__)

        if self._changed_only:
            params = _changed_params(object)
        else:
            params = object.get_params(deep=False)

        params = OrderedDict((name, val) for (name, val) in sorted(params.items()))

        # allowance + 1 accounts for the ")" written after the parameters.
        self._format_params(
            params.items(), stream, indent, allowance + 1, context, level
        )
        stream.write(")")

    def _format_dict_items(self, items, stream, indent, allowance, context, level):
        """Format dict items; rendered as ``key: value`` (is_dict=True)."""
        return self._format_params_or_dict_items(
            items, stream, indent, allowance, context, level, is_dict=True
        )

    def _format_params(self, items, stream, indent, allowance, context, level):
        """Format estimator parameters; rendered as ``key=value`` (is_dict=False)."""
        return self._format_params_or_dict_items(
            items, stream, indent, allowance, context, level, is_dict=False
        )

    def _format_params_or_dict_items(
        self, object, stream, indent, allowance, context, level, is_dict
    ):
        """Format dict items or parameters respecting the compact=True
        parameter. For some reason, the builtin rendering of dict items doesn't
        respect compact=True and will use one line per key-value if all cannot
        fit in a single line.
        Dict items will be rendered as <'key': value> while params will be
        rendered as <key=value>. The implementation is mostly copy/pasting from
        the builtin _format_items().
        This also adds ellipsis if the number of items is greater than
        self.n_max_elements_to_show.
        """
        write = stream.write
        indent += self._indent_per_level
        delimnl = ",\n" + " " * indent
        delim = ""
        width = max_width = self._width - indent + 1
        it = iter(object)
        try:
            next_ent = next(it)
        except StopIteration:
            # Nothing to format.
            return
        last = False
        n_items = 0
        # The iterator is read one entry ahead so that `last` is known while
        # the current entry is being written.
        while not last:
            if n_items == self.n_max_elements_to_show:
                write(", ...")
                break
            n_items += 1
            ent = next_ent
            try:
                next_ent = next(it)
            except StopIteration:
                last = True
                # The last entry has `allowance` fewer columns available.
                max_width -= allowance
                width -= allowance
            if self._compact:
                k, v = ent
                krepr = self._repr(k, context, level)
                vrepr = self._repr(v, context, level)
                if not is_dict:
                    # Parameter names are shown without quotes.
                    krepr = krepr.strip("'")
                middle = ": " if is_dict else "="
                rep = krepr + middle + vrepr
                w = len(rep) + 2
                if width < w:
                    # Does not fit on the current line: start a new one
                    # (unless nothing has been written yet).
                    width = max_width
                    if delim:
                        delim = delimnl
                if width >= w:
                    width -= w
                    write(delim)
                    delim = ", "
                    write(rep)
                    continue
            # Non-compact mode, or an entry too long for a whole line: let
            # _format() render it through the KeyValTuple dispatch entry.
            write(delim)
            delim = delimnl
            class_ = KeyValTuple if is_dict else KeyValTupleParam
            self._format(
                class_(ent), stream, indent, allowance if last else 1, context, level
            )

    def _format_items(self, items, stream, indent, allowance, context, level):
        """Format the items of an iterable (list, tuple...). Same as the
        built-in _format_items, with support for ellipsis if the number of
        elements is greater than self.n_max_elements_to_show.
        """
        write = stream.write
        indent += self._indent_per_level
        if self._indent_per_level > 1:
            write((self._indent_per_level - 1) * " ")
        delimnl = ",\n" + " " * indent
        delim = ""
        width = max_width = self._width - indent + 1
        it = iter(items)
        try:
            next_ent = next(it)
        except StopIteration:
            # Nothing to format.
            return
        last = False
        n_items = 0
        # Same one-entry look-ahead as in _format_params_or_dict_items.
        while not last:
            if n_items == self.n_max_elements_to_show:
                write(", ...")
                break
            n_items += 1
            ent = next_ent
            try:
                next_ent = next(it)
            except StopIteration:
                last = True
                max_width -= allowance
                width -= allowance
            if self._compact:
                rep = self._repr(ent, context, level)
                w = len(rep) + 2
                if width < w:
                    width = max_width
                    if delim:
                        delim = delimnl
                if width >= w:
                    width -= w
                    write(delim)
                    delim = ", "
                    write(rep)
                    continue
            write(delim)
            delim = delimnl
            self._format(ent, stream, indent, allowance if last else 1, context, level)

    def _pprint_key_val_tuple(self, object, stream, indent, allowance, context, level):
        """Pretty printing for key-value tuples from dict or parameters."""
        k, v = object
        rep = self._repr(k, context, level)
        if isinstance(object, KeyValTupleParam):
            rep = rep.strip("'")
            middle = "="
        else:
            middle = ": "
        stream.write(rep)
        stream.write(middle)
        # The value is indented past the key and the separator just written.
        self._format(
            v, stream, indent + len(rep) + len(middle), allowance, context, level
        )

    # Note: need to copy _dispatch to prevent instances of the builtin
    # PrettyPrinter class to call methods of _EstimatorPrettyPrinter (see issue
    # 12906)
    # mypy error: "Type[PrettyPrinter]" has no attribute "_dispatch"
    _dispatch = pprint.PrettyPrinter._dispatch.copy()  # type: ignore
    _dispatch[BaseEstimator.__repr__] = _pprint_estimator
    _dispatch[KeyValTuple.__repr__] = _pprint_key_val_tuple
|
||||
|
||||
|
||||
def _safe_repr(object, context, maxlevels, level, changed_only=False):
    """Same as the builtin _safe_repr, with added support for Estimator
    objects.

    Parameters
    ----------
    object : object
        The object to render.
    context : dict
        Ids of the containers currently being rendered, used to detect
        recursive structures. Entries are added and removed in place.
    maxlevels : int or None
        Nesting depth from which containers are elided; falsy means no limit.
    level : int
        Current nesting depth.
    changed_only : bool, default=False
        For estimators, render the output of ``_changed_params`` instead of
        ``get_params(deep=False)``.

    Returns
    -------
    (rep, readable, recursive) : tuple
        The representation string, followed by the readable and recursive
        flags computed for it.
    """
    typ = type(object)
    if typ in pprint._builtin_scalars:
        return repr(object), True, False

    def render_pairs(pairs, template, unquote_key):
        # Shared by the dict and estimator branches: render every (key, value)
        # pair one level deeper and combine the readable / recursive flags.
        rendered = []
        readable = True
        recursive = False
        for key, value in pairs:
            key_repr, key_readable, key_recursive = _safe_repr(
                key, context, maxlevels, level + 1, changed_only=changed_only
            )
            value_repr, value_readable, value_recursive = _safe_repr(
                value, context, maxlevels, level + 1, changed_only=changed_only
            )
            if unquote_key:
                key_repr = key_repr.strip("'")
            rendered.append(template % (key_repr, value_repr))
            readable = readable and key_readable and value_readable
            if key_recursive or value_recursive:
                recursive = True
        return ", ".join(rendered), readable, recursive

    type_repr = getattr(typ, "__repr__", None)

    # --- dicts that keep the builtin dict repr ---
    if issubclass(typ, dict) and type_repr is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return pprint._recursion(object), False, True
        context[objid] = 1
        ordered_items = sorted(object.items(), key=pprint._safe_tuple)
        body, readable, recursive = render_pairs(ordered_items, "%s: %s", False)
        del context[objid]
        return "{%s}" % body, readable, recursive

    # --- lists / tuples that keep the builtin repr ---
    plain_list = issubclass(typ, list) and type_repr is list.__repr__
    plain_tuple = issubclass(typ, tuple) and type_repr is tuple.__repr__
    if plain_list or plain_tuple:
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            opener, closer = "[", "]"
        elif len(object) == 1:
            # 1-tuples need the trailing comma.
            opener, closer = "(", ",)"
        else:
            if not object:
                return "()", True, False
            opener, closer = "(", ")"
        objid = id(object)
        if maxlevels and level >= maxlevels:
            return opener + "..." + closer, False, objid in context
        if objid in context:
            return pprint._recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        rendered = []
        for item in object:
            item_repr, item_readable, item_recursive = _safe_repr(
                item, context, maxlevels, level + 1, changed_only=changed_only
            )
            rendered.append(item_repr)
            if not item_readable:
                readable = False
            if item_recursive:
                recursive = True
        del context[objid]
        return opener + ", ".join(rendered) + closer, readable, recursive

    # --- estimators: rendered as ClassName(param=value, ...) ---
    if issubclass(typ, BaseEstimator):
        objid = id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return pprint._recursion(object), False, True
        context[objid] = 1
        if changed_only:
            params = _changed_params(object)
        else:
            params = object.get_params(deep=False)
        ordered_items = sorted(params.items(), key=pprint._safe_tuple)
        body, readable, recursive = render_pairs(ordered_items, "%s=%s", True)
        del context[objid]
        return ("%s(%s)" % (typ.__name__, body), readable, recursive)

    # --- anything else: plain repr ---
    rep = repr(object)
    return rep, (rep and not rep.startswith("<")), False
|
||||
Binary file not shown.
@@ -0,0 +1,44 @@
|
||||
# Authors: Arnaud Joly
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
ctypedef np.npy_uint32 UINT32_t
|
||||
|
||||
# Replacement seed: our_rand_r (below in this file) stores this value into
# the seed whenever it is handed a seed equal to 0.
cdef inline UINT32_t DEFAULT_SEED = 1

cdef enum:
    # Max value for our rand_r replacement (near the bottom).
    # We don't use RAND_MAX because it's different across platforms and
    # particularly tiny on Windows/MSVC.
    # our_rand_r returns its state modulo (RAND_R_MAX + 1), so this is the
    # largest value it can produce.
    RAND_R_MAX = 0x7FFFFFFF
|
||||
|
||||
# Declaration only; the body is not in this file. In a Cython declaration
# `=*` marks an argument as optional, with its default value supplied where
# the function is implemented.
cpdef sample_without_replacement(np.int_t n_population,
                                 np.int_t n_samples,
                                 method=*,
                                 random_state=*)
|
||||
|
||||
# rand_r replacement using a 32bit XorShift generator
|
||||
# See http://www.jstatsoft.org/v08/i14/paper for details
|
||||
cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil:
    """Generate a pseudo-random np.uint32 from a np.uint32 seed

    The value pointed to by ``seed`` is overwritten with the new generator
    state, so calling again with the same pointer continues the sequence.
    The returned value is that state reduced modulo ``RAND_R_MAX + 1``,
    i.e. it lies in ``[0, RAND_R_MAX]``.
    """
    # seed shouldn't ever be 0.
    # (x ^ (x << k)) and (x ^ (x >> k)) both map 0 to 0, so a zero state
    # would never change; substitute the default seed instead.
    if (seed[0] == 0): seed[0] = DEFAULT_SEED

    # The three xor-shift steps (left 13, right 17, left 5). Each shifted
    # value is cast back to 32 bits before being xor-ed into the state.
    seed[0] ^= <UINT32_t>(seed[0] << 13)
    seed[0] ^= <UINT32_t>(seed[0] >> 17)
    seed[0] ^= <UINT32_t>(seed[0] << 5)

    # Note: we must be careful with the final line cast to np.uint32 so that
    # the function behaves consistently across platforms.
    #
    # The following cast might yield different results on different platforms:
    # wrong_cast = <UINT32_t> RAND_R_MAX + 1
    #
    # We can use:
    # good_cast = <UINT32_t>(RAND_R_MAX + 1)
    # or:
    # cdef np.uint32_t another_good_cast = <UINT32_t>RAND_R_MAX + 1
    return seed[0] % <UINT32_t>(RAND_R_MAX + 1)
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,116 @@
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
"""
|
||||
Dataset abstractions for sequential data access.
|
||||
WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
|
||||
"""
|
||||
|
||||
cimport numpy as np
|
||||
|
||||
# SequentialDataset and its two concrete subclasses are (optionally randomized)
|
||||
# iterators over the rows of a matrix X and corresponding target values y.
|
||||
|
||||
|
||||
# Base class of the double-precision ("64") dataset family. Per the comment
# at the top of this file, SequentialDataset and its subclasses are
# (optionally randomized) iterators over the rows of a matrix X and the
# corresponding target values y. Only attributes and C-level method
# signatures are declared here; the implementations are not in this file.
cdef class SequentialDataset64:
    cdef int current_index
    cdef np.ndarray index
    # NOTE(review): presumably points at the data of `index` -- confirm in
    # the implementation file.
    cdef int *index_data_ptr
    cdef Py_ssize_t n_samples
    cdef np.uint32_t seed

    cdef void shuffle(self, np.uint32_t seed) nogil
    cdef int _get_next_index(self) nogil
    cdef int _get_random_index(self) nogil

    # The sampling methods hand results back through output pointers
    # (x_data_ptr, x_ind_ptr, nnz, y, sample_weight) rather than through the
    # return value. All floating-point outputs are `double` in this class.
    cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr,
                      int *nnz, double *y, double *sample_weight,
                      int current_index) nogil
    cdef void next(self, double **x_data_ptr, int **x_ind_ptr,
                   int *nnz, double *y, double *sample_weight) nogil
    # Unlike `next`, `random` is declared to return an int.
    # NOTE(review): presumably the index of the sampled row -- confirm.
    cdef int random(self, double **x_data_ptr, int **x_ind_ptr,
                    int *nnz, double *y, double *sample_weight) nogil
|
||||
|
||||
|
||||
# Subclass of SequentialDataset64 that adds attributes only; no methods are
# declared at this level.
# NOTE(review): judging by the name and the X / X_stride / n_features
# fields this is the dense-array variant -- confirm in the implementation.
cdef class ArrayDataset64(SequentialDataset64):
    cdef np.ndarray X
    cdef np.ndarray Y
    cdef np.ndarray sample_weights
    cdef Py_ssize_t n_features
    cdef np.npy_intp X_stride
    # Raw `double` pointers.
    # NOTE(review): presumably into X, Y and sample_weights above -- confirm.
    cdef double *X_data_ptr
    cdef double *Y_data_ptr
    cdef np.ndarray feature_indices
    cdef int *feature_indices_ptr
    cdef double *sample_weight_data
|
||||
|
||||
|
||||
# Subclass of SequentialDataset64 that adds attributes only; no methods are
# declared at this level.
# NOTE(review): the X_data / X_indptr / X_indices triple looks like the
# three arrays of a CSR sparse matrix -- confirm in the implementation.
cdef class CSRDataset64(SequentialDataset64):
    cdef np.ndarray X_data
    cdef np.ndarray X_indptr
    cdef np.ndarray X_indices
    cdef np.ndarray Y
    cdef np.ndarray sample_weights
    # Raw pointers: `double` for values, `int` for index arrays.
    # NOTE(review): presumably into the arrays above -- confirm.
    cdef double *X_data_ptr
    cdef int *X_indptr_ptr
    cdef int *X_indices_ptr
    cdef double *Y_data_ptr
    cdef double *sample_weight_data
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
"""
|
||||
Dataset abstractions for sequential data access.
|
||||
WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
|
||||
"""
|
||||
|
||||
cimport numpy as np
|
||||
|
||||
# SequentialDataset and its two concrete subclasses are (optionally randomized)
|
||||
# iterators over the rows of a matrix X and corresponding target values y.
|
||||
|
||||
|
||||
# Single-precision ("32") counterpart of SequentialDataset64: the same
# attributes and method names, with every floating-point output declared as
# `float` instead of `double`. Only declarations live here; the
# implementations are not in this file.
cdef class SequentialDataset32:
    cdef int current_index
    cdef np.ndarray index
    # NOTE(review): presumably points at the data of `index` -- confirm in
    # the implementation file.
    cdef int *index_data_ptr
    cdef Py_ssize_t n_samples
    cdef np.uint32_t seed

    cdef void shuffle(self, np.uint32_t seed) nogil
    cdef int _get_next_index(self) nogil
    cdef int _get_random_index(self) nogil

    # The sampling methods hand results back through output pointers
    # (x_data_ptr, x_ind_ptr, nnz, y, sample_weight) rather than through the
    # return value.
    cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr,
                      int *nnz, float *y, float *sample_weight,
                      int current_index) nogil
    cdef void next(self, float **x_data_ptr, int **x_ind_ptr,
                   int *nnz, float *y, float *sample_weight) nogil
    # Unlike `next`, `random` is declared to return an int.
    # NOTE(review): presumably the index of the sampled row -- confirm.
    cdef int random(self, float **x_data_ptr, int **x_ind_ptr,
                    int *nnz, float *y, float *sample_weight) nogil
|
||||
|
||||
|
||||
# Subclass of SequentialDataset32 that adds attributes only; no methods are
# declared at this level. Same field names as ArrayDataset64, with `float`
# pointers instead of `double`.
# NOTE(review): judging by the name this is the dense-array variant --
# confirm in the implementation.
cdef class ArrayDataset32(SequentialDataset32):
    cdef np.ndarray X
    cdef np.ndarray Y
    cdef np.ndarray sample_weights
    cdef Py_ssize_t n_features
    cdef np.npy_intp X_stride
    # NOTE(review): presumably pointers into X, Y and sample_weights above.
    cdef float *X_data_ptr
    cdef float *Y_data_ptr
    cdef np.ndarray feature_indices
    cdef int *feature_indices_ptr
    cdef float *sample_weight_data
|
||||
|
||||
|
||||
# Subclass of SequentialDataset32 that adds attributes only; no methods are
# declared at this level. Same field names as CSRDataset64, with `float`
# pointers instead of `double`.
# NOTE(review): the X_data / X_indptr / X_indices triple looks like the
# three arrays of a CSR sparse matrix -- confirm in the implementation.
cdef class CSRDataset32(SequentialDataset32):
    cdef np.ndarray X_data
    cdef np.ndarray X_indptr
    cdef np.ndarray X_indices
    cdef np.ndarray Y
    cdef np.ndarray sample_weights
    # NOTE(review): presumably pointers into the arrays above -- confirm.
    cdef float *X_data_ptr
    cdef int *X_indptr_ptr
    cdef int *X_indices_ptr
    cdef float *Y_data_ptr
    cdef float *sample_weight_data
|
||||
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
Utility methods to print system info for debugging
|
||||
|
||||
adapted from :func:`pandas.show_versions`
|
||||
"""
|
||||
# License: BSD 3 clause
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from ..utils.fixes import threadpool_info
|
||||
from .. import __version__
|
||||
|
||||
|
||||
from ._openmp_helpers import _openmp_parallelism_enabled
|
||||
|
||||
|
||||
def _get_sys_info():
    """Gather interpreter and platform details.

    Returns
    -------
    sys_info : dict
        Three entries, in this order: ``"python"`` (``sys.version`` with
        newlines replaced by spaces), ``"executable"`` (``sys.executable``)
        and ``"machine"`` (``platform.platform()``).
    """
    # sys.version can span several lines; flatten it so it prints on one row.
    flattened_version = sys.version.replace("\n", " ")

    return {
        "python": flattened_version,
        "executable": sys.executable,
        "machine": platform.platform(),
    }
|
||||
|
||||
|
||||
def _get_deps_info():
    """Report the installed version of the main dependencies.

    The dependencies are not imported; versions are looked up through
    package metadata (``importlib.metadata`` on Python >= 3.8,
    ``pkg_resources`` before that).

    Returns
    -------
    deps_info: dict
        ``"sklearn"`` mapped to this package's ``__version__``, followed by
        one entry per dependency mapped to its version, or to ``None`` when
        no installed distribution of that name is found.
    """
    dependency_names = (
        "pip",
        "setuptools",
        "numpy",
        "scipy",
        "Cython",
        "pandas",
        "matplotlib",
        "joblib",
        "threadpoolctl",
    )

    deps_info = {"sklearn": __version__}

    if sys.version_info >= (3, 8):
        from importlib.metadata import version, PackageNotFoundError

        for name in dependency_names:
            try:
                deps_info[name] = version(name)
            except PackageNotFoundError:
                deps_info[name] = None

        return deps_info

    # Backwards compatibility with Python < 3.8, primarily for PyPy.
    # TODO: remove once PyPy 3.8 is available on conda-forge and
    # therefore on our CI.
    # https://github.com/conda-forge/conda-forge-pinning-feedstock/issues/2089
    try:
        from pkg_resources import get_distribution, DistributionNotFound

        for name in dependency_names:
            try:
                deps_info[name] = get_distribution(name).version
            except DistributionNotFound:
                deps_info[name] = None

    except ImportError:
        # Setuptools not installed: every dependency is reported as None.
        deps_info.update(dict.fromkeys(dependency_names))

    return deps_info
|
||||
|
||||
|
||||
def show_versions():
    """Print useful debugging information.

    Writes to standard output, in this order: the system information from
    ``_get_sys_info``, the dependency versions from ``_get_deps_info``,
    whether the build has OpenMP enabled, and, when ``threadpool_info()``
    returns a non-empty result, every key/value pair of each entry.

    .. versionadded:: 0.20
    """

    sys_info = _get_sys_info()
    deps_info = _get_deps_info()

    print("\nSystem:")
    for k, stat in sys_info.items():
        print(f"{k:>10}: {stat}")

    print("\nPython dependencies:")
    for k, stat in deps_info.items():
        print(f"{k:>13}: {stat}")

    print(f"\nBuilt with OpenMP: {_openmp_parallelism_enabled()}")

    # show threadpoolctl results
    threadpool_results = threadpool_info()
    if threadpool_results:
        print()
        print("threadpoolctl info:")

        for i, result in enumerate(threadpool_results):
            for key, val in result.items():
                print(f"{key:>15}: {val}")
            # Blank line between entries, but not after the last one.
            if i != len(threadpool_results) - 1:
                print()
|
||||
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
from ._typedefs cimport DTYPE_t, ITYPE_t
|
||||
|
||||
from cython cimport floating
|
||||
|
||||
# Declaration only; the body is not in this file. `floating` is Cython's
# fused type, so the function is specialised for each floating-point type
# it is used with.
# NOTE(review): by its name this presumably sorts `dist` and `idx` together
# over their first `size` entries -- confirm in the implementation.
cdef int simultaneous_sort(
    floating *dist,
    ITYPE_t *idx,
    ITYPE_t size,
) nogil
|
||||
@@ -0,0 +1,67 @@
|
||||
import numpy as np
|
||||
|
||||
# Tag values used for estimators that do not provide their own; see
# `_safe_tags` below for how they are combined with `_more_tags()`.
_DEFAULT_TAGS = {
    "non_deterministic": False,
    "requires_positive_X": False,
    "requires_positive_y": False,
    "X_types": ["2darray"],
    "poor_score": False,
    "no_validation": False,
    "multioutput": False,
    "allow_nan": False,
    "stateless": False,
    "multilabel": False,
    "_skip_test": False,
    "_xfail_checks": False,
    "multioutput_only": False,
    "binary_only": False,
    "requires_fit": True,
    "preserves_dtype": [np.float64],
    "requires_y": False,
    "pairwise": False,
}


def _safe_tags(estimator, key=None):
    """Safely get estimator tags.

    :class:`~sklearn.BaseEstimator` provides the estimator tags machinery.
    However, if an estimator does not inherit from this base class, we should
    fall-back to the default tags.

    For scikit-learn built-in estimators, we should still rely on
    `self._get_tags()`. `_safe_tags(est)` should be used when we are not sure
    where `est` comes from: typically `_safe_tags(self.base_estimator)` where
    `self` is a meta-estimator, or in the common checks.

    The tags are looked up in this order: `estimator._get_tags()` if that
    method exists; otherwise `_DEFAULT_TAGS` overridden by
    `estimator._more_tags()` if that method exists; otherwise a copy of
    `_DEFAULT_TAGS`.

    Parameters
    ----------
    estimator : estimator object
        The estimator from which to get the tag.

    key : str, default=None
        Tag name to get. By default (`None`), all tags are returned.

    Returns
    -------
    tags : dict or tag value
        The estimator tags. A single value is returned if `key` is not None.

    Raises
    ------
    ValueError
        If `key` is given but is not among the tags that were found.
    """
    if hasattr(estimator, "_get_tags"):
        tags_provider = "_get_tags()"
        tags = estimator._get_tags()
    elif hasattr(estimator, "_more_tags"):
        tags_provider = "_more_tags()"
        tags = {**_DEFAULT_TAGS, **estimator._more_tags()}
    else:
        tags_provider = "_DEFAULT_TAGS"
        # Return a copy, not the module-level dict: a caller that edits the
        # result must not change the defaults seen by every other
        # estimator. The copy is shallow, like the merge in the
        # `_more_tags` branch, so list values are still shared.
        tags = dict(_DEFAULT_TAGS)

    if key is not None:
        if key not in tags:
            raise ValueError(
                f"The key {key} is not defined in {tags_provider} for the "
                f"class {estimator.__class__.__name__}."
            )
        return tags[key]
    return tags
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,17 @@
|
||||
#!python
|
||||
cimport numpy as np
|
||||
|
||||
# Floating point/data type
|
||||
ctypedef np.float64_t DTYPE_t  # WARNING: should match DTYPE in typedefs.pyx

# NumPy type-number constants, one per typedef declared in this file.
cdef enum:
    DTYPECODE = np.NPY_FLOAT64
    ITYPECODE = np.NPY_INTP
    INT32TYPECODE = np.NPY_INT32
    INT64TYPECODE = np.NPY_INT64

# Index/integer type.
# WARNING: ITYPE_t must be a signed integer type or you will have a bad time!
ctypedef np.intp_t ITYPE_t  # WARNING: should match ITYPE in typedefs.pyx
ctypedef np.int32_t INT32TYPE_t  # WARNING: should match INT32TYPE in typedefs.pyx
ctypedef np.int64_t INT64TYPE_t  # WARNING: should match INT64TYPE in typedefs.pyx
|
||||
Binary file not shown.
@@ -0,0 +1,12 @@
|
||||
cimport numpy as np
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from ..utils._typedefs cimport ITYPE_t, DTYPE_t, INT32TYPE_t, INT64TYPE_t
|
||||
|
||||
# Fused type covering a C++ std::vector of each of the four element types
# cimported from _typedefs above.
ctypedef fused vector_typed:
    vector[DTYPE_t]
    vector[ITYPE_t]
    vector[INT32TYPE_t]
    vector[INT64TYPE_t]

# Declaration only; the body is not in this file. Takes a pointer to one of
# the vector types above and returns a NumPy ndarray.
# NOTE(review): whether the data is copied or shared with the vector cannot
# be told from this declaration -- check the implementation.
cdef np.ndarray vector_to_nd_array(vector_typed * vect_ptr)
|
||||
Binary file not shown.
@@ -0,0 +1,45 @@
|
||||
|
||||
# WARNING: Do not edit this .pyx file directly, it is generated from its .pyx.tp
|
||||
cimport numpy as np
|
||||
|
||||
# Double-precision weight vector: attribute and C-level method declarations
# only; the implementations are not in this file.
cdef class WeightVector64(object):
    # `w` and `aw` are C-contiguous 1-D memoryviews, readable (not writable)
    # as attributes from Python.
    # NOTE(review): `aw` is presumably the averaged weights, given
    # add_average / average_a / average_b below -- confirm.
    cdef readonly double[::1] w
    cdef readonly double[::1] aw
    # NOTE(review): presumably raw pointers to the data of `w` and `aw`.
    cdef double *w_data_ptr
    cdef double *aw_data_ptr
    cdef double wscale
    cdef double average_a
    cdef double average_b
    cdef int n_features
    cdef double sq_norm

    # The x_data_ptr / x_ind_ptr / xnnz triple recurs in add, add_average
    # and dot.
    # NOTE(review): presumably a sparse vector given as values, indices and
    # number of non-zeros -- confirm.
    cdef void add(self, double *x_data_ptr, int *x_ind_ptr,
                  int xnnz, double c) nogil
    cdef void add_average(self, double *x_data_ptr, int *x_ind_ptr,
                          int xnnz, double c, double num_iter) nogil
    cdef double dot(self, double *x_data_ptr, int *x_ind_ptr,
                    int xnnz) nogil
    cdef void scale(self, double c) nogil
    cdef void reset_wscale(self) nogil
    cdef double norm(self) nogil
|
||||
|
||||
# Single-precision counterpart of WeightVector64: the same attribute and
# method names with `float` in place of `double`. Declarations only; the
# implementations are not in this file.
cdef class WeightVector32(object):
    # `w` and `aw` are C-contiguous 1-D memoryviews, readable (not writable)
    # as attributes from Python.
    # NOTE(review): `aw` is presumably the averaged weights -- confirm.
    cdef readonly float[::1] w
    cdef readonly float[::1] aw
    # NOTE(review): presumably raw pointers to the data of `w` and `aw`.
    cdef float *w_data_ptr
    cdef float *aw_data_ptr
    cdef float wscale
    cdef float average_a
    cdef float average_b
    cdef int n_features
    cdef float sq_norm

    # NOTE(review): x_data_ptr / x_ind_ptr / xnnz presumably describe a
    # sparse vector (values, indices, number of non-zeros) -- confirm.
    cdef void add(self, float *x_data_ptr, int *x_ind_ptr,
                  int xnnz, float c) nogil
    cdef void add_average(self, float *x_data_ptr, int *x_ind_ptr,
                          int xnnz, float c, float num_iter) nogil
    cdef float dot(self, float *x_data_ptr, int *x_ind_ptr,
                   int xnnz) nogil
    cdef void scale(self, float c) nogil
    cdef void reset_wscale(self) nogil
    cdef float norm(self) nogil
|
||||
Binary file not shown.
@@ -0,0 +1,194 @@
|
||||
# Authors: Andreas Mueller
|
||||
# Manoj Kumar
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import sparse
|
||||
|
||||
|
||||
def compute_class_weight(class_weight, *, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, 'balanced' or None
        If 'balanced', class weights will be given by
        ``n_samples / (n_classes * np.bincount(y))``.
        If a dictionary is given, keys are classes and values
        are corresponding class weights.
        If None is given, the class weights will be uniform.

    classes : array-like
        Array of the classes occurring in the data, as given by
        ``np.unique(y_org)`` with ``y_org`` the original class labels.
        Converted with ``np.asarray``, so a list or tuple is accepted too.

    y : array-like of shape (n_samples,)
        Array of original class labels per sample.

    Returns
    -------
    class_weight_vect : ndarray of shape (n_classes,)
        Array with class_weight_vect[i] the weight for i-th class.

    Raises
    ------
    ValueError
        If `y` contains a label missing from `classes`; if `class_weight`
        is 'balanced' and `classes` contains a label missing from `y`; if
        `class_weight` is neither a dict, 'balanced' nor None; or if
        `class_weight` is a dict that leaves some of `classes` unweighted
        while also holding keys that are not in `classes`.

    References
    ----------
    The "balanced" heuristic is inspired by
    Logistic Regression in Rare Events Data, King, Zeng, 2001.
    """
    # `.shape` is used below, so normalise list/tuple input to an ndarray.
    classes = np.asarray(classes)

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can be in y")
    if class_weight is None or len(class_weight) == 0:
        # uniform class weights
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
    elif class_weight == "balanced":
        # Imported here rather than at module level because a top-level
        # import is circular; kept inside this branch because it is the
        # only one that needs the encoder.
        from ..preprocessing import LabelEncoder

        # Find the weight of each class as present in y.
        le = LabelEncoder()
        y_ind = le.fit_transform(y)
        if not all(np.in1d(classes, le.classes_)):
            raise ValueError("classes should have valid labels that are in y")

        recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
        weight = recip_freq[le.transform(classes)]
    else:
        # user-defined dictionary
        weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
        if not isinstance(class_weight, dict):
            raise ValueError(
                "class_weight must be dict, 'balanced', or None, got: %r" % class_weight
            )
        unweighted_classes = []
        for i, c in enumerate(classes):
            if c in class_weight:
                weight[i] = class_weight[c]
            else:
                # Classes absent from the dict keep the default weight 1.
                unweighted_classes.append(c)

        # A mismatch here means the dict has at least one key that is not
        # in `classes`; combined with unweighted classes that is reported
        # as an error.
        n_weighted_classes = len(classes) - len(unweighted_classes)
        if unweighted_classes and n_weighted_classes != len(class_weight):
            raise ValueError(
                f"The classes, {unweighted_classes}, are not in class_weight"
            )

    return weight
|
||||
|
||||
|
||||
def compute_sample_weight(class_weight, y, *, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "balanced", or None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
        [{1:1}, {2:5}, {3:1}, {4:1}].

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data:
        ``n_samples / (n_classes * np.bincount(y))``.

        For multi-output, the weights of each column of y will be multiplied.

    y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
        Array of original class labels per sample.

    indices : array-like of shape (n_subsample,), default=None
        Array of indices to be used in a subsample. Can be of length less than
        n_samples in the case of a subsample, or equal to n_samples in the
        case of a bootstrap subsample with repeated indices. If None, the
        sample weight will be calculated over the full sample. Only "balanced"
        is supported for class_weight if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray of shape (n_samples,)
        Array with sample weights as applied to the original y.

    Raises
    ------
    ValueError
        If `class_weight` is a string other than "balanced"; if `indices`
        is given and `class_weight` is not a string; or if `y` has several
        columns and `class_weight` is not an iterable (other than a dict)
        with one element per column.
    """

    # Ensure y is 2D. Sparse matrices are already 2D.
    if not sparse.issparse(y):
        y = np.atleast_1d(y)
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if isinstance(class_weight, str):
        if class_weight != "balanced":
            raise ValueError(
                'The only valid preset for class_weight is "balanced". Given "%s".'
                % class_weight
            )
    elif indices is not None:
        # Wrap in a 1-tuple so that a tuple-valued class_weight is printed
        # as a whole instead of being unpacked by the %-operator, which
        # raised TypeError and hid this error.
        raise ValueError(
            'The only valid class_weight for subsampling is "balanced". Given "%s".'
            % (class_weight,)
        )
    elif n_outputs > 1:
        if not hasattr(class_weight, "__iter__") or isinstance(class_weight, dict):
            raise ValueError(
                "For multi-output, class_weight should be a "
                "list of dicts, or a valid string."
            )
        if len(class_weight) != n_outputs:
            raise ValueError(
                "For multi-output, number of elements in "
                "class_weight should match number of outputs."
            )

    expanded_class_weight = []
    for k in range(n_outputs):

        y_full = y[:, k]
        if sparse.issparse(y_full):
            # Ok to densify a single column at a time
            y_full = y_full.toarray().flatten()
        classes_full = np.unique(y_full)
        classes_missing = None

        # "balanced" and the single-output case share one class_weight;
        # otherwise each column has its own entry.
        if class_weight == "balanced" or n_outputs == 1:
            class_weight_k = class_weight
        else:
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y_full[indices]
            classes_subsample = np.unique(y_subsample)

            # mode="clip" keeps the lookup in range for classes absent from
            # the subsample; their weight is zeroed just below.
            weight_k = np.take(
                compute_class_weight(
                    class_weight_k, classes=classes_subsample, y=y_subsample
                ),
                np.searchsorted(classes_subsample, classes_full),
                mode="clip",
            )

            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(
                class_weight_k, classes=classes_full, y=y_full
            )

        # Expand the per-class weights to one weight per sample.
        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero
            weight_k[np.in1d(y_full, list(classes_missing))] = 0.0

        expanded_class_weight.append(weight_k)

    # Multiply the per-column weights together, sample by sample.
    expanded_class_weight = np.prod(expanded_class_weight, axis=0, dtype=np.float64)

    return expanded_class_weight
|
||||
@@ -0,0 +1,127 @@
|
||||
import warnings
|
||||
import functools
|
||||
|
||||
|
||||
__all__ = ["deprecated"]
|
||||
|
||||
|
||||
class deprecated:
    """Decorator to mark a function or class as deprecated.

    Issue a warning when the function is called/the class is instantiated and
    adds a warning to the docstring.

    The optional extra argument will be appended to the deprecation message
    and the docstring. Note: to use this with the default value for extra, put
    in an empty pair of parentheses:

    >>> from sklearn.utils import deprecated
    >>> deprecated()
    <sklearn.utils.deprecation.deprecated object at ...>

    >>> @deprecated()
    ... def some_function(): pass

    Parameters
    ----------
    extra : str, default=''
          To be added to the deprecation messages.
    """

    # Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary,
    # but with many changes.

    def __init__(self, extra=""):
        self.extra = extra

    def __call__(self, obj):
        """Decorate `obj` according to its kind.

        Parameters
        ----------
        obj : object
            A class, a property, or any other callable (treated as a
            function).

        Returns
        -------
        The class itself (with its ``__init__`` replaced), a new property,
        or a wrapper function.
        """
        if isinstance(obj, type):
            return self._decorate_class(obj)
        elif isinstance(obj, property):
            # Note that this is only triggered properly if the `property`
            # decorator comes before the `deprecated` decorator, like so:
            #
            # @deprecated(msg)
            # @property
            # def deprecated_attribute_(self):
            #     ...
            return self._decorate_property(obj)
        else:
            return self._decorate_fun(obj)

    def _decorate_class(self, cls):
        """Replace ``cls.__init__`` with a version that warns, return `cls`.

        The original initializer stays reachable as
        ``cls.__init__.deprecated_original``.
        """
        msg = "Class %s is deprecated" % cls.__name__
        if self.extra:
            msg += "; %s" % self.extra

        # FIXME: we should probably reset __new__ for full generality
        init = cls.__init__

        def wrapped(*args, **kwargs):
            warnings.warn(msg, category=FutureWarning)
            return init(*args, **kwargs)

        cls.__init__ = wrapped

        wrapped.__name__ = "__init__"
        wrapped.__doc__ = self._update_doc(init.__doc__)
        wrapped.deprecated_original = init

        return cls

    def _decorate_fun(self, fun):
        """Return a wrapper around `fun` that warns on every call."""

        msg = "Function %s is deprecated" % fun.__name__
        if self.extra:
            msg += "; %s" % self.extra

        # functools.wraps also sets ``wrapped.__wrapped__ = fun``, which is
        # what lets callers introspect the original function's arguments.
        @functools.wraps(fun)
        def wrapped(*args, **kwargs):
            warnings.warn(msg, category=FutureWarning)
            return fun(*args, **kwargs)

        wrapped.__doc__ = self._update_doc(wrapped.__doc__)

        return wrapped

    def _decorate_property(self, prop):
        """Return a new property whose getter warns, then calls ``prop.fget``.

        Unlike classes and functions, the warning text is ``self.extra``
        alone, with no generated prefix.
        """
        msg = self.extra

        @property
        @functools.wraps(prop)
        def wrapped(*args, **kwargs):
            warnings.warn(msg, category=FutureWarning)
            return prop.fget(*args, **kwargs)

        wrapped.__doc__ = self._update_doc(wrapped.__doc__)

        return wrapped

    def _update_doc(self, olddoc):
        """Return `olddoc` prefixed by a "DEPRECATED" notice.

        The notice includes ``self.extra`` when it is non-empty; `olddoc`
        is appended after a blank line when it is non-empty.
        """
        newdoc = "DEPRECATED"
        if self.extra:
            newdoc = "%s: %s" % (newdoc, self.extra)
        if olddoc:
            newdoc = "%s\n\n %s" % (newdoc, olddoc)
        return newdoc
|
||||
|
||||
|
||||
def _is_deprecated(func):
    """Helper to check if func is wrapped by our deprecated decorator

    The check is a heuristic: it concatenates every string held in the
    closure cells of `func` and reports whether the result contains the
    word "deprecated". Objects without a ``__closure__`` attribute, or
    whose ``__closure__`` is None, are reported as not deprecated.
    """
    closures = getattr(func, "__closure__", None) or ()

    captured_strings = []
    for cell in closures:
        try:
            contents = cell.cell_contents
        except ValueError:
            # An empty cell (variable not bound yet) has no contents to
            # inspect; reading it raises ValueError.
            continue
        if isinstance(contents, str):
            captured_strings.append(contents)

    return "deprecated" in "".join(captured_strings)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,165 @@
|
||||
"""Compatibility fixes for older version of python, numpy and scipy
|
||||
|
||||
If you add content to this file, please give the version of the package
|
||||
at which the fix is no longer needed.
|
||||
"""
|
||||
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
|
||||
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Fabian Pedregosa <fpedregosa@acm.org>
|
||||
# Lars Buitinck
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from functools import update_wrapper
|
||||
import functools
|
||||
|
||||
import sklearn
|
||||
import numpy as np
|
||||
import scipy
|
||||
import scipy.stats
|
||||
import threadpoolctl
|
||||
from .._config import config_context, get_config
|
||||
from ..externals._packaging.version import parse as parse_version
|
||||
|
||||
|
||||
np_version = parse_version(np.__version__)
|
||||
sp_version = parse_version(scipy.__version__)
|
||||
|
||||
|
||||
if sp_version >= parse_version("1.4"):
|
||||
from scipy.sparse.linalg import lobpcg
|
||||
else:
|
||||
# Backport of lobpcg functionality from scipy 1.4.0, can be removed
|
||||
# once support for sp_version < parse_version('1.4') is dropped
|
||||
# mypy error: Name 'lobpcg' already defined (possibly by an import)
|
||||
from ..externals._lobpcg import lobpcg # type: ignore # noqa
|
||||
|
||||
try:
|
||||
from scipy.optimize._linesearch import line_search_wolfe2, line_search_wolfe1
|
||||
except ImportError: # SciPy < 1.8
|
||||
from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1 # type: ignore # noqa
|
||||
|
||||
|
||||
def _object_dtype_isnan(X):
    """Return ``X != X``.

    A value that compares unequal to itself (as a float NaN does) yields
    True. NOTE(review): the name suggests X is an object-dtype array and the
    result an elementwise boolean mask -- confirm against the callers.
    """
    return X != X
|
||||
|
||||
|
||||
class loguniform(scipy.stats.reciprocal):
    """A class supporting log-uniform random variables.

    Parameters
    ----------
    low : float
        The minimum value
    high : float
        The maximum value

    Methods
    -------
    rvs(self, size=None, random_state=None)
        Generate log-uniform random variables

    The most useful method for Scikit-learn usage is highlighted here.
    For a full list, see
    `scipy.stats.reciprocal
    <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.reciprocal.html>`_.
    This list includes all functions of ``scipy.stats`` continuous
    distributions such as ``pdf``.

    Notes
    -----
    This class generates values between ``low`` and ``high`` or

        low <= loguniform(low, high).rvs() <= high

    The logarithmic probability density function (PDF) is uniform. When
    ``x`` is a uniformly distributed random variable between 0 and 1, ``10**x``
    are random variables that are equally likely to be returned.

    This class is an alias to ``scipy.stats.reciprocal``, which uses the
    reciprocal distribution:
    https://en.wikipedia.org/wiki/Reciprocal_distribution

    Examples
    --------

    >>> from sklearn.utils.fixes import loguniform
    >>> rv = loguniform(1e-3, 1e1)
    >>> rvs = rv.rvs(random_state=42, size=1000)
    >>> rvs.min()  # doctest: +SKIP
    0.0010435856341129003
    >>> rvs.max()  # doctest: +SKIP
    9.97403052786026
    """
|
||||
|
||||
|
||||
# remove when https://github.com/joblib/joblib/issues/1071 is fixed
|
||||
def delayed(function):
    """Decorator used to capture the arguments of a function.

    Calling the decorated object does not run `function`. It returns the
    tuple ``(_FuncWrapper(function), args, kwargs)``, so the call can be
    carried out later; the `_FuncWrapper` is created at that moment and
    therefore records the configuration active when the arguments are
    captured.
    """

    @functools.wraps(function)
    def delayed_function(*args, **kwargs):
        return _FuncWrapper(function), args, kwargs

    return delayed_function
|
||||
|
||||
|
||||
class _FuncWrapper:
    """Load the global configuration before calling the function.

    The configuration returned by ``get_config()`` is recorded when the
    wrapper is created and re-applied with ``config_context`` around every
    call of the wrapped function.
    """

    def __init__(self, function):
        self.function = function
        # Snapshot of the configuration at construction time.
        self.config = get_config()
        # Copy the wrapped function's metadata onto this object.
        update_wrapper(self, self.function)

    def __call__(self, *args, **kwargs):
        # Run the function under the configuration captured in __init__.
        with config_context(**self.config):
            return self.function(*args, **kwargs)
|
||||
|
||||
|
||||
# Rename the `method` kwarg to `interpolation` for NumPy < 1.22, because
|
||||
# `interpolation` kwarg was deprecated in favor of `method` in NumPy >= 1.22.
|
||||
def _percentile(a, q, *, method="linear", **kwargs):
    """Call ``np.percentile``, passing `method` as its ``interpolation`` keyword.

    All other keyword arguments are forwarded unchanged. The module selects
    this function as ``percentile`` only when NumPy is older than 1.22.
    """
    return np.percentile(a, q, interpolation=method, **kwargs)
|
||||
|
||||
|
||||
if np_version < parse_version("1.22"):
|
||||
percentile = _percentile
|
||||
else: # >= 1.22
|
||||
from numpy import percentile # type: ignore # noqa
|
||||
|
||||
|
||||
# compatibility fix for threadpoolctl >= 3.0.0
|
||||
# since version 3 it's possible to setup a global threadpool controller to avoid
|
||||
# looping through all loaded shared libraries each time.
|
||||
# the global controller is created during the first call to threadpoolctl.
|
||||
def _get_threadpool_controller():
    # Returns the controller stored on the `sklearn` module, creating it on
    # the first call, or None when the installed threadpoolctl has no
    # `ThreadpoolController` class.
    supports_controller = hasattr(threadpoolctl, "ThreadpoolController")
    if not supports_controller:
        return None

    try:
        return sklearn._sklearn_threadpool_controller
    except AttributeError:
        # First call: build the controller and cache it on the module.
        sklearn._sklearn_threadpool_controller = threadpoolctl.ThreadpoolController()
        return sklearn._sklearn_threadpool_controller
|
||||
|
||||
|
||||
def threadpool_limits(limits=None, user_api=None):
    # No docstring here: __doc__ is assigned from threadpoolctl just below.
    shared_controller = _get_threadpool_controller()
    if shared_controller is None:
        # No shared controller available: use the module-level function.
        return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)
    return shared_controller.limit(limits=limits, user_api=user_api)


threadpool_limits.__doc__ = threadpoolctl.threadpool_limits.__doc__
|
||||
|
||||
|
||||
def threadpool_info():
    # No docstring here: __doc__ is assigned from threadpoolctl just below.
    shared_controller = _get_threadpool_controller()
    if shared_controller is None:
        # No shared controller available: use the module-level function.
        return threadpoolctl.threadpool_info()
    return shared_controller.info()


threadpool_info.__doc__ = threadpoolctl.threadpool_info.__doc__
|
||||
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
Graph utilities and algorithms
|
||||
|
||||
Graphs are represented with their adjacency matrices, preferably using
|
||||
sparse matrices.
|
||||
"""
|
||||
|
||||
# Authors: Aric Hagberg <hagberg@lanl.gov>
|
||||
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Jake Vanderplas <vanderplas@astro.washington.edu>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from .deprecation import deprecated
|
||||
from ..metrics.pairwise import pairwise_distances
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Path and connected component analysis.
|
||||
# Code adapted from networkx
|
||||
def single_source_shortest_path_length(graph, source, *, cutoff=None):
    """Return the shortest path length from source to all reachable nodes.

    The graph is explored breadth-first; the result maps every reached node
    to its number of hops from ``source``.

    Parameters
    ----------
    graph : {sparse matrix, ndarray} of shape (n, n)
        Adjacency matrix of the graph. Sparse matrix of format LIL is
        preferred.

    source : int
        Starting node for path.

    cutoff : int, default=None
        Depth to stop the search - only paths of length <= cutoff are returned.

    Returns
    -------
    paths : dict
        Shortest path lengths keyed by target node.

    Examples
    --------
    >>> from sklearn.utils.graph import single_source_shortest_path_length
    >>> import numpy as np
    >>> graph = np.array([[ 0, 1, 0, 0],
    ...                   [ 1, 0, 1, 0],
    ...                   [ 0, 1, 0, 1],
    ...                   [ 0, 0, 1, 0]])
    >>> list(sorted(single_source_shortest_path_length(graph, 0).items()))
    [(0, 0), (1, 1), (2, 2), (3, 3)]
    >>> graph = np.ones((6, 6))
    >>> list(sorted(single_source_shortest_path_length(graph, 2).items()))
    [(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]
    """
    # LIL format exposes the neighbours of each node through ``.rows``.
    adjacency = graph.tolil() if sparse.isspmatrix(graph) else sparse.lil_matrix(graph)

    hops = {}  # node -> BFS depth at which it was first reached
    depth = 0
    frontier = [source]  # nodes to examine at the current depth
    while frontier:
        discovered = set()  # candidate nodes for the next depth
        for node in frontier:
            if node in hops:
                continue
            hops[node] = depth
            discovered.update(adjacency.rows[node])
        frontier = discovered
        if cutoff is not None and depth >= cutoff:
            break
        depth += 1
    return hops
|
||||
|
||||
|
||||
@deprecated(
    "`graph_shortest_path` is deprecated in 1.0 (renaming of 0.25) and will "
    "be removed in 1.2. Use `scipy.sparse.csgraph.shortest_path` instead."
)
def graph_shortest_path(dist_matrix, directed=True, method="auto"):
    """Shortest-path graph search on a positive directed or undirected graph.

    Thin wrapper that forwards its arguments to
    ``scipy.sparse.csgraph.shortest_path``.

    Parameters
    ----------
    dist_matrix : arraylike or sparse matrix, shape = (N,N)
        Array of positive distances.
        If vertex i is connected to vertex j, then dist_matrix[i,j] gives
        the distance between the vertices.
        If vertex i is not connected to vertex j, then dist_matrix[i,j] = 0

    directed : boolean
        if True, then find the shortest path on a directed graph: only
        progress from a point to its neighbors, not the other way around.
        if False, then find the shortest path on an undirected graph: the
        algorithm can progress from a point to its neighbors and vice versa.

    method : {'auto', 'FW', 'D'}, default='auto'
        method to use.  Options are
        'auto' : attempt to choose the best method for the current problem
        'FW' : Floyd-Warshall algorithm.  O[N^3]
        'D' : Dijkstra's algorithm with Fibonacci heaps.  O[(k+log(N))N^2]

    Returns
    -------
    G : np.ndarray, float, shape = [N,N]
        G[i,j] gives the shortest distance from point i to point j
        along the graph.

    Notes
    -----
    As currently implemented, Dijkstra's algorithm does not work for
    graphs with direction-dependent distances when directed == False.
    i.e., if dist_matrix[i,j] and dist_matrix[j,i] are not equal and
    both are nonzero, method='D' will not necessarily yield the correct
    result.
    Also, these routines have not been tested for graphs with negative
    distances.  Negative distances can lead to infinite cycles that must
    be handled by specialized algorithms.
    """
    solver = sparse.csgraph.shortest_path
    return solver(dist_matrix, directed=directed, method=method)
|
||||
|
||||
|
||||
def _fix_connected_components(
|
||||
X,
|
||||
graph,
|
||||
n_connected_components,
|
||||
component_labels,
|
||||
mode="distance",
|
||||
metric="euclidean",
|
||||
**kwargs,
|
||||
):
|
||||
"""Add connections to sparse graph to connect unconnected components.
|
||||
|
||||
For each pair of unconnected components, compute all pairwise distances
|
||||
from one component to the other, and add a connection on the closest pair
|
||||
of samples. This is a hacky way to get a graph with a single connected
|
||||
component, which is necessary for example to compute a shortest path
|
||||
between all pairs of samples in the graph.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array of shape (n_samples, n_features) or (n_samples, n_samples)
|
||||
Features to compute the pairwise distances. If `metric =
|
||||
"precomputed"`, X is the matrix of pairwise distances.
|
||||
|
||||
graph : sparse matrix of shape (n_samples, n_samples)
|
||||
Graph of connection between samples.
|
||||
|
||||
n_connected_components : int
|
||||
Number of connected components, as computed by
|
||||
`scipy.sparse.csgraph.connected_components`.
|
||||
|
||||
component_labels : array of shape (n_samples)
|
||||
Labels of connected components, as computed by
|
||||
`scipy.sparse.csgraph.connected_components`.
|
||||
|
||||
mode : {'connectivity', 'distance'}, default='distance'
|
||||
Type of graph matrix: 'connectivity' corresponds to the connectivity
|
||||
matrix with ones and zeros, and 'distance' corresponds to the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
metric : str
|
||||
Metric used in `sklearn.metrics.pairwise.pairwise_distances`.
|
||||
|
||||
kwargs : kwargs
|
||||
Keyword arguments passed to
|
||||
`sklearn.metrics.pairwise.pairwise_distances`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
graph : sparse matrix of shape (n_samples, n_samples)
|
||||
Graph of connection between samples, with a single connected component.
|
||||
"""
|
||||
if metric == "precomputed" and sparse.issparse(X):
|
||||
raise RuntimeError(
|
||||
"_fix_connected_components with metric='precomputed' requires the "
|
||||
"full distance matrix in X, and does not work with a sparse "
|
||||
"neighbors graph."
|
||||
)
|
||||
|
||||
for i in range(n_connected_components):
|
||||
idx_i = np.flatnonzero(component_labels == i)
|
||||
Xi = X[idx_i]
|
||||
for j in range(i):
|
||||
idx_j = np.flatnonzero(component_labels == j)
|
||||
Xj = X[idx_j]
|
||||
|
||||
if metric == "precomputed":
|
||||
D = X[np.ix_(idx_i, idx_j)]
|
||||
else:
|
||||
D = pairwise_distances(Xi, Xj, metric=metric, **kwargs)
|
||||
|
||||
ii, jj = np.unravel_index(D.argmin(axis=None), D.shape)
|
||||
if mode == "connectivity":
|
||||
graph[idx_i[ii], idx_j[jj]] = 1
|
||||
graph[idx_j[jj], idx_i[ii]] = 1
|
||||
elif mode == "distance":
|
||||
graph[idx_i[ii], idx_j[jj]] = D[ii, jj]
|
||||
graph[idx_j[jj], idx_i[ii]] = D[ii, jj]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unknown mode=%r, should be one of ['connectivity', 'distance']."
|
||||
% mode
|
||||
)
|
||||
|
||||
return graph
|
||||
@@ -0,0 +1,315 @@
|
||||
"""Utilities for meta-estimators"""
|
||||
# Author: Joel Nothman
|
||||
# Andreas Mueller
|
||||
# License: BSD
|
||||
from typing import List, Any
|
||||
from types import MethodType
|
||||
import warnings
|
||||
from functools import wraps
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from operator import attrgetter
|
||||
from functools import update_wrapper
|
||||
import numpy as np
|
||||
from contextlib import suppress
|
||||
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils._tags import _safe_tags
|
||||
from ..base import BaseEstimator
|
||||
|
||||
__all__ = ["available_if", "if_delegate_has_method"]
|
||||
|
||||
|
||||
class _BaseComposition(BaseEstimator, metaclass=ABCMeta):
    """Handles parameter management for classifiers composed of named estimators."""

    steps: List[Any]

    @abstractmethod
    def __init__(self):
        pass

    def _get_params(self, attr, deep=True):
        """Return parameters, optionally expanded with the named sub-estimators.

        ``attr`` is the name of the attribute holding the list of
        ``(name, estimator)`` pairs. With ``deep=True`` each pair is added
        under ``name`` and each nested parameter under ``name__param``.
        """
        params = super().get_params(deep=deep)
        if deep:
            named_estimators = getattr(self, attr)
            try:
                params.update(named_estimators)
            except (TypeError, ValueError):
                # Ignore TypeError for cases where estimators is not a list of
                # (name, estimator) and ignore ValueError when the list is not
                # formatted correctly. This is to prevent errors when calling
                # `set_params`. `BaseEstimator.set_params` calls `get_params` which
                # can error for invalid values for `estimators`.
                return params

            for est_name, est in named_estimators:
                if not hasattr(est, "get_params"):
                    continue
                nested = est.get_params(deep=True)
                for key, value in nested.items():
                    params["%s__%s" % (est_name, key)] = value
        return params

    def _set_params(self, attr, **params):
        """Set parameters, handling whole-list and per-estimator replacement.

        Order is strict: first the whole ``attr`` list, then individual named
        estimators, and finally everything else via ``set_params`` of the
        parent class.
        """
        # 1. All steps
        if attr in params:
            setattr(self, attr, params.pop(attr))

        # 2. Replace items with estimators in params
        current = getattr(self, attr)
        if isinstance(current, list) and current:
            # Get item names used to identify valid names in params
            # `zip` raises a TypeError when `items` does not contains
            # elements of length 2
            with suppress(TypeError):
                known_names, _ = zip(*current)
                for key in list(params):
                    if "__" in key or key not in known_names:
                        continue
                    self._replace_estimator(attr, key, params.pop(key))

        # 3. Step parameters and other initialisation arguments
        super().set_params(**params)
        return self

    def _replace_estimator(self, attr, name, new_val):
        """Swap the estimator registered under ``name`` for ``new_val``.

        Assumes ``name`` is a valid estimator name. The list stored on the
        instance is replaced by a modified copy.
        """
        updated = list(getattr(self, attr))
        for position, (current_name, _) in enumerate(updated):
            if current_name == name:
                updated[position] = (name, new_val)
                break
        setattr(self, attr, updated)

    def _validate_names(self, names):
        """Raise ``ValueError`` for duplicate, conflicting or malformed names."""
        distinct = set(names)
        if len(distinct) != len(names):
            raise ValueError("Names provided are not unique: {0!r}".format(list(names)))

        clashes = distinct.intersection(self.get_params(deep=False))
        if clashes:
            raise ValueError(
                "Estimator names conflict with constructor arguments: {0!r}".format(
                    sorted(clashes)
                )
            )

        with_dunder = [name for name in names if "__" in name]
        if with_dunder:
            raise ValueError(
                "Estimator names must not contain __: got {0!r}".format(with_dunder)
            )
|
||||
|
||||
|
||||
class _AvailableIfDescriptor:
|
||||
"""Implements a conditional property using the descriptor protocol.
|
||||
|
||||
Using this class to create a decorator will raise an ``AttributeError``
|
||||
if check(self) returns a falsey value. Note that if check raises an error
|
||||
this will also result in hasattr returning false.
|
||||
|
||||
See https://docs.python.org/3/howto/descriptor.html for an explanation of
|
||||
descriptors.
|
||||
"""
|
||||
|
||||
def __init__(self, fn, check, attribute_name):
|
||||
self.fn = fn
|
||||
self.check = check
|
||||
self.attribute_name = attribute_name
|
||||
|
||||
# update the docstring of the descriptor
|
||||
update_wrapper(self, fn)
|
||||
|
||||
def __get__(self, obj, owner=None):
|
||||
attr_err = AttributeError(
|
||||
f"This {repr(owner.__name__)} has no attribute {repr(self.attribute_name)}"
|
||||
)
|
||||
if obj is not None:
|
||||
# delegate only on instances, not the classes.
|
||||
# this is to allow access to the docstrings.
|
||||
if not self.check(obj):
|
||||
raise attr_err
|
||||
out = MethodType(self.fn, obj)
|
||||
|
||||
else:
|
||||
# This makes it possible to use the decorated method as an unbound method,
|
||||
# for instance when monkeypatching.
|
||||
@wraps(self.fn)
|
||||
def out(*args, **kwargs):
|
||||
if not self.check(args[0]):
|
||||
raise attr_err
|
||||
return self.fn(*args, **kwargs)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def available_if(check):
    """An attribute that is available only if check returns a truthy value

    Parameters
    ----------
    check : callable
        When passed the object with the decorated method, this should return
        a truthy value if the attribute is available, and either return False
        or raise an AttributeError if not available.

    Returns
    -------
    decorator : callable
        Decorator wrapping a method in an ``_AvailableIfDescriptor`` named
        after the decorated function.

    Examples
    --------
    >>> from sklearn.utils.metaestimators import available_if
    >>> class HelloIfEven:
    ...    def __init__(self, x):
    ...        self.x = x
    ...
    ...    def _x_is_even(self):
    ...        return self.x % 2 == 0
    ...
    ...    @available_if(_x_is_even)
    ...    def say_hello(self):
    ...        print("Hello")
    ...
    >>> obj = HelloIfEven(1)
    >>> hasattr(obj, "say_hello")
    False
    >>> obj.x = 2
    >>> hasattr(obj, "say_hello")
    True
    >>> obj.say_hello()
    Hello
    """

    def decorator(fn):
        return _AvailableIfDescriptor(fn, check, attribute_name=fn.__name__)

    return decorator
|
||||
|
||||
|
||||
# TODO(1.3) remove
|
||||
class _IffHasAttrDescriptor(_AvailableIfDescriptor):
    """Implements a conditional property using the descriptor protocol.

    Using this class to create a decorator will raise an ``AttributeError``
    if none of the delegates (specified in ``delegate_names``) is an attribute
    of the base object or the first found delegate does not have an attribute
    ``attribute_name``.

    This allows ducktyping of the decorated method based on
    ``delegate.attribute_name``. Here ``delegate`` is the first item in
    ``delegate_names`` for which ``hasattr(object, delegate) is True``.

    See https://docs.python.org/3/howto/descriptor.html for an explanation of
    descriptors.
    """

    def __init__(self, fn, delegate_names, attribute_name):
        super().__init__(fn, self._check, attribute_name)
        self.delegate_names = delegate_names

    def _check(self, obj):
        """Return True if the first delegate found on ``obj`` has the attribute.

        Emits a ``FutureWarning`` on every call. Returns False when no
        delegate is found; lets the delegate's own ``AttributeError``
        propagate when the delegate lacks ``attribute_name``.
        """
        # The replacement API is `available_if`; the message previously named
        # a non-existent `if_available`.
        warnings.warn(
            "if_delegate_has_method was deprecated in version 1.1 and will be "
            "removed in version 1.3. Use available_if instead.",
            FutureWarning,
        )

        delegate = None
        for delegate_name in self.delegate_names:
            try:
                delegate = attrgetter(delegate_name)(obj)
                break
            except AttributeError:
                continue

        if delegate is None:
            return False
        # raise original AttributeError
        getattr(delegate, self.attribute_name)

        return True
|
||||
|
||||
|
||||
# TODO(1.3) remove
|
||||
def if_delegate_has_method(delegate):
    """Create a decorator for methods that are delegated to a sub-estimator

    This enables ducktyping by hasattr returning True according to the
    sub-estimator.

    .. deprecated:: 1.1
        `if_delegate_has_method` is deprecated in version 1.1 and will be removed in
        version 1.3. Use `available_if` instead.

    Parameters
    ----------
    delegate : str, list of str or tuple of str
        Name of the sub-estimator that can be accessed as an attribute of the
        base object. If a list or a tuple of names are provided, the first
        sub-estimator that is an attribute of the base object will be used.

    Returns
    -------
    decorator : callable
        Decorator wrapping a method in an ``_IffHasAttrDescriptor``.
    """
    # Normalise the argument to a tuple of delegate names.
    if isinstance(delegate, list):
        delegate_names = tuple(delegate)
    elif isinstance(delegate, tuple):
        delegate_names = delegate
    else:
        delegate_names = (delegate,)

    def decorator(fn):
        return _IffHasAttrDescriptor(fn, delegate_names, attribute_name=fn.__name__)

    return decorator
|
||||
|
||||
|
||||
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels.

    Slice X, y according to indices for cross-validation, but take care of
    precomputed kernel-matrices or pairwise affinities / distances.

    If the estimator's ``pairwise`` tag is true, X needs to be square and
    we slice rows and columns. If ``train_indices`` is not None,
    we slice rows using ``indices`` (assumed the test set) and columns
    using ``train_indices``, indicating the training set.

    Labels y will always be indexed only along the first axis.

    Parameters
    ----------
    estimator : object
        Estimator to determine whether we should slice only rows or rows and
        columns.

    X : array-like, sparse matrix or iterable
        Data to be indexed. If the estimator's ``pairwise`` tag is true,
        this needs to be a square array-like or sparse matrix.

    y : array-like, sparse matrix or iterable
        Targets to be indexed.

    indices : array of int
        Rows to select from X and y.
        If the estimator's ``pairwise`` tag is true and ``train_indices is
        None`` then ``indices`` will also be used to slice columns.

    train_indices : array of int or None, default=None
        If the estimator's ``pairwise`` tag is true and ``train_indices is
        not None``, then ``train_indices`` will be used to slice the columns
        of X.

    Returns
    -------
    X_subset : array-like, sparse matrix or list
        Indexed data.

    y_subset : array-like, sparse matrix or list
        Indexed targets.

    Raises
    ------
    ValueError
        For a pairwise estimator, if X has no ``shape`` attribute or is not
        square.
    """
    if not _safe_tags(estimator, key="pairwise"):
        X_subset = _safe_indexing(X, indices)
    else:
        if not hasattr(X, "shape"):
            raise ValueError(
                "Precomputed kernels or affinity matrices have "
                "to be passed as arrays or sparse matrices."
            )
        # X is a precomputed square kernel matrix
        n_rows, n_cols = X.shape[0], X.shape[1]
        if n_rows != n_cols:
            raise ValueError("X should be a square kernel matrix")
        column_indices = indices if train_indices is None else train_indices
        X_subset = X[np.ix_(indices, column_indices)]

    y_subset = None if y is None else _safe_indexing(y, indices)

    return X_subset, y_subset
|
||||
@@ -0,0 +1,493 @@
|
||||
# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
"""
|
||||
Multi-class / multi-label utility function
|
||||
==========================================
|
||||
|
||||
"""
|
||||
from collections.abc import Sequence
|
||||
from itertools import chain
|
||||
import warnings
|
||||
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import dok_matrix
|
||||
from scipy.sparse import lil_matrix
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .validation import check_array, _assert_all_finite
|
||||
|
||||
|
||||
def _unique_multiclass(y):
|
||||
if hasattr(y, "__array__"):
|
||||
return np.unique(np.asarray(y))
|
||||
else:
|
||||
return set(y)
|
||||
|
||||
|
||||
def _unique_indicator(y):
    """Return the column indices of a label indicator matrix.

    ``y`` is validated with ``check_array`` (CSR, CSC and COO sparse formats
    are accepted) and one label index is produced per column.
    """
    validated = check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"])
    n_labels = validated.shape[1]
    return np.arange(n_labels)
|
||||
|
||||
|
||||
# Dispatch table used by ``unique_labels``: maps a target type name to the
# helper that extracts the labels for that type. Target types without an
# entry are rejected by ``unique_labels`` as unknown.
_FN_UNIQUE_LABELS = {
    "binary": _unique_multiclass,
    "multiclass": _unique_multiclass,
    "multilabel-indicator": _unique_indicator,
}
|
||||
|
||||
|
||||
def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
        - mix of multilabel and multiclass (single label) targets
        - mix of label indicator matrix and anything else
          (because there are no explicit labels)
        - mix of label indicator matrices of different sizes
        - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes

    Returns
    -------
    out : ndarray of shape (n_unique_labels,)
        An ordered array of unique labels.

    Raises
    ------
    ValueError
        If no argument is given, if the target types are mixed or
        unsupported, if indicator matrices differ in width, or if string and
        non-string labels are mixed.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    if not ys:
        raise ValueError("No argument has been passed.")

    # Check that we don't mix label format
    ys_types = {type_of_target(target) for target in ys}
    if ys_types == {"binary", "multiclass"}:
        # binary is a special case of multiclass
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    (label_type,) = ys_types

    # Check consistency for the indicator format
    if label_type == "multilabel-indicator":
        widths = {
            check_array(target, accept_sparse=["csr", "csc", "coo"]).shape[1]
            for target in ys
        }
        if len(widths) > 1:
            raise ValueError(
                "Multi-label binary indicator input with different numbers of labels"
            )

    # Get the unique set of labels
    extract = _FN_UNIQUE_LABELS.get(label_type, None)
    if not extract:
        raise ValueError("Unknown label type: %s" % repr(ys))

    ys_labels = set()
    for target in ys:
        ys_labels.update(extract(target))

    # Check that we don't mix string type with number type
    label_kinds = {isinstance(label, str) for label in ys_labels}
    if len(label_kinds) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return np.array(sorted(ys_labels))
|
||||
|
||||
|
||||
def _is_integral_float(y):
|
||||
return y.dtype.kind == "f" and np.all(y.astype(int) == y)
|
||||
|
||||
|
||||
def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    out : bool
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, "__array__") or isinstance(y, Sequence):
        # The warning class left the top-level namespace in NumPy 2.0 and
        # lives in ``np.exceptions`` there.
        visible_deprecation = getattr(np, "VisibleDeprecationWarning", None)
        if visible_deprecation is None:
            visible_deprecation = np.exceptions.VisibleDeprecationWarning

        # Ragged input triggers a VisibleDeprecationWarning on older NumPy
        # and a ValueError on newer releases, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        with warnings.catch_warnings():
            warnings.simplefilter("error", visible_deprecation)
            try:
                y = np.asarray(y)
            except (visible_deprecation, ValueError):
                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = np.array(y, dtype=object)

    # An indicator matrix is 2d with at least two columns.
    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        # With implicit zeros, at most one distinct stored value is allowed.
        return (
            len(y.data) == 0
            or np.unique(y.data).size == 1
            and (
                y.dtype.kind in "biu"  # bool, int, uint
                or _is_integral_float(np.unique(y.data))
            )
        )
    else:
        labels = np.unique(y)

        return len(labels) < 3 and (
            y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
        )
|
||||
|
||||
|
||||
def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like
        Target values.

    Raises
    ------
    ValueError
        If the detected target type is not one of the allowed types.
    """
    allowed_types = (
        "binary",
        "multiclass",
        "multiclass-multioutput",
        "multilabel-indicator",
        "multilabel-sequences",
    )
    y_type = type_of_target(y, input_name="y")
    if y_type in allowed_types:
        return
    raise ValueError("Unknown label type: %r" % y_type)
|
||||
|
||||
|
||||
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    input_name : str, default=""
        The data name used to construct the error message.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type : str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Raises
    ------
    ValueError
        If `y` is not array-like, is a pandas ``SparseSeries`` or
        ``SparseArray``, or uses the legacy sequence-of-sequences multilabel
        representation.

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    valid = (
        isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__")
    ) and not isinstance(y, str)

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # The warning class left the top-level namespace in NumPy 2.0 and lives
    # in ``np.exceptions`` there.
    visible_deprecation = getattr(np, "VisibleDeprecationWarning", None)
    if visible_deprecation is None:
        visible_deprecation = np.exceptions.VisibleDeprecationWarning

    # Ragged input triggers a VisibleDeprecationWarning on older NumPy and a
    # ValueError on newer releases, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    with warnings.catch_warnings():
        warnings.simplefilter("error", visible_deprecation)
        try:
            y = np.asarray(y)
        except (visible_deprecation, ValueError):
            # dtype=object should be provided explicitly for ragged arrays,
            # see NEP 34
            y = np.asarray(y, dtype=object)

    # The old sequence of sequences format
    try:
        if (
            not hasattr(y[0], "__array__")
            and isinstance(y[0], Sequence)
            and not isinstance(y[0], str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        # Empty input: there is no first element to inspect.
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)):
        return "unknown"  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return "unknown"  # [[]]

    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == "f" and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        _assert_all_finite(y, input_name=input_name)
        return "continuous" + suffix

    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        return "multiclass" + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]
|
||||
|
||||
|
||||
def _check_partial_fit_first_call(clf, classes=None):
|
||||
"""Private helper function for factorizing common classes param logic.
|
||||
|
||||
Estimators that implement the ``partial_fit`` API need to be provided with
|
||||
the list of possible classes at the first call to partial_fit.
|
||||
|
||||
Subsequent calls to partial_fit should check that ``classes`` is still
|
||||
consistent with a previous value of ``clf.classes_`` when provided.
|
||||
|
||||
This function returns True if it detects that this was the first call to
|
||||
``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
|
||||
set on ``clf``.
|
||||
|
||||
"""
|
||||
if getattr(clf, "classes_", None) is None and classes is None:
|
||||
raise ValueError("classes must be passed on the first call to partial_fit.")
|
||||
|
||||
elif classes is not None:
|
||||
if getattr(clf, "classes_", None) is not None:
|
||||
if not np.array_equal(clf.classes_, unique_labels(classes)):
|
||||
raise ValueError(
|
||||
"`classes=%r` is not the same as on last call "
|
||||
"to partial_fit, was: %r" % (classes, clf.classes_)
|
||||
)
|
||||
|
||||
else:
|
||||
# This is the first call to partial_fit
|
||||
clf.classes_ = unique_labels(classes)
|
||||
return True
|
||||
|
||||
# classes is None and clf.classes_ has already previously been set:
|
||||
# nothing to do
|
||||
return False
|
||||
|
||||
|
||||
def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data.

    Parameters
    ----------
    y : {array-like, sparse matrix} of size (n_samples, n_outputs)
        The labels for each example.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of ndarray of size (n_classes,)
        Sorted unique labels of each column.

    n_classes : list of int of size n_outputs
        Number of distinct labels in each column.

    class_prior : list of size n_outputs of ndarray of size (n_classes,)
        (Weighted) frequency of each label of each column, normalized to
        sum to one.
    """
    classes, n_classes, class_prior = [], [], []

    # Unpacking also enforces that ``y`` is two-dimensional.
    _, n_outputs = y.shape
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if not issparse(y):
        for col in range(n_outputs):
            labels, codes = np.unique(y[:, col], return_inverse=True)
            weight_per_label = np.bincount(codes, weights=sample_weight)
            classes.append(labels)
            n_classes.append(labels.shape[0])
            class_prior.append(weight_per_label / weight_per_label.sum())
        return (classes, n_classes, class_prior)

    y = y.tocsc()
    n_rows = y.shape[0]
    stored_per_col = np.diff(y.indptr)

    for col in range(n_outputs):
        start, stop = y.indptr[col], y.indptr[col + 1]
        stored_rows = y.indices[start:stop]

        # Split the total weight between stored entries and implicit zeros.
        if sample_weight is None:
            stored_weight = None
            implicit_zero_weight = n_rows - stored_per_col[col]
        else:
            stored_weight = sample_weight[stored_rows]
            implicit_zero_weight = np.sum(sample_weight) - np.sum(stored_weight)

        labels, codes = np.unique(y.data[start:stop], return_inverse=True)
        weight_per_label = np.bincount(codes, weights=stored_weight)

        if 0 in labels:
            # An explicit zero is stored: merge the weight of the implicit
            # zeros into its entry.
            weight_per_label[labels == 0] += implicit_zero_weight
        elif stored_per_col[col] < n_rows:
            # There are implicit zeros but no entry for class 0 yet: add one
            # at the front.
            labels = np.insert(labels, 0, 0)
            weight_per_label = np.insert(weight_per_label, 0, implicit_zero_weight)

        classes.append(labels)
        n_classes.append(labels.shape[0])
        class_prior.append(weight_per_label / weight_per_label.sum())

    return (classes, n_classes, class_prior)
|
||||
|
||||
|
||||
def _ovr_decision_function(predictions, confidences, n_classes):
|
||||
"""Compute a continuous, tie-breaking OvR decision function from OvO.
|
||||
|
||||
It is important to include a continuous value, not only votes,
|
||||
to make computing AUC or calibration meaningful.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
predictions : array-like of shape (n_samples, n_classifiers)
|
||||
Predicted classes for each binary classifier.
|
||||
|
||||
confidences : array-like of shape (n_samples, n_classifiers)
|
||||
Decision functions or predicted probabilities for positive class
|
||||
for each binary classifier.
|
||||
|
||||
n_classes : int
|
||||
Number of classes. n_classifiers must be
|
||||
``n_classes * (n_classes - 1 ) / 2``.
|
||||
"""
|
||||
n_samples = predictions.shape[0]
|
||||
votes = np.zeros((n_samples, n_classes))
|
||||
sum_of_confidences = np.zeros((n_samples, n_classes))
|
||||
|
||||
k = 0
|
||||
for i in range(n_classes):
|
||||
for j in range(i + 1, n_classes):
|
||||
sum_of_confidences[:, i] -= confidences[:, k]
|
||||
sum_of_confidences[:, j] += confidences[:, k]
|
||||
votes[predictions[:, k] == 0, i] += 1
|
||||
votes[predictions[:, k] == 1, j] += 1
|
||||
k += 1
|
||||
|
||||
# Monotonically transform the sum_of_confidences to (-1/3, 1/3)
|
||||
# and add it with votes. The monotonic transformation is
|
||||
# f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
|
||||
# to ensure that we won't reach the limits and change vote order.
|
||||
# The motivation is to use confidence levels as a way to break ties in
|
||||
# the votes without switching any decision made based on a difference
|
||||
# of 1 vote.
|
||||
transformed_confidences = sum_of_confidences / (
|
||||
3 * (np.abs(sum_of_confidences) + 1)
|
||||
)
|
||||
return votes + transformed_confidences
|
||||
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
"""Export fast murmurhash C/C++ routines + cython wrappers"""
|
||||
|
||||
cimport numpy as np
|
||||
|
||||
# The C API is disabled for now, since it requires -I flags to get
|
||||
# compilation to work even when these functions are not used.
|
||||
#cdef extern from "MurmurHash3.h":
|
||||
# void MurmurHash3_x86_32(void* key, int len, unsigned int seed,
|
||||
# void* out)
|
||||
#
|
||||
# void MurmurHash3_x86_128(void* key, int len, unsigned int seed,
|
||||
# void* out)
|
||||
#
|
||||
# void MurmurHash3_x64_128(void* key, int len, unsigned int seed,
|
||||
# void* out)
|
||||
|
||||
|
||||
cpdef np.uint32_t murmurhash3_int_u32(int key, unsigned int seed)
|
||||
cpdef np.int32_t murmurhash3_int_s32(int key, unsigned int seed)
|
||||
cpdef np.uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed)
|
||||
cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed)
|
||||
@@ -0,0 +1,266 @@
|
||||
"""
|
||||
Our own implementation of the Newton algorithm
|
||||
|
||||
Unlike the scipy.optimize version, this version of the Newton conjugate
|
||||
gradient solver uses only one function call to retrieve the
|
||||
func value, the gradient value and a callable for the Hessian matvec
|
||||
product. If the function call is very expensive (e.g. for logistic
|
||||
regression with large design matrix), this approach gives very
|
||||
significant speedups.
|
||||
"""
|
||||
# This is a modified file from scipy.optimize
|
||||
# Original authors: Travis Oliphant, Eric Jones
|
||||
# Modifications by Gael Varoquaux, Mathieu Blondel and Tom Dupre la Tour
|
||||
# License: BSD
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from .fixes import line_search_wolfe1, line_search_wolfe2
|
||||
from ..exceptions import ConvergenceWarning
|
||||
|
||||
|
||||
class _LineSearchError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):
    """Run ``line_search_wolfe1`` and fall back to ``line_search_wolfe2``.

    The second search is only attempted when the first one reports no step
    length (first element of its result is None). All arguments are forwarded
    unchanged to both searches.

    Returns
    -------
    ret : tuple
        Result of the first search that found a step length.

    Raises
    ------
    _LineSearchError
        If no suitable step size is found.
    """
    for search in (line_search_wolfe1, line_search_wolfe2):
        ret = search(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)
        if ret[0] is not None:
            return ret

    # Both searches failed to produce a step length.
    raise _LineSearchError()
|
||||
|
||||
|
||||
def _cg(fhess_p, fgrad, maxiter, tol):
|
||||
"""
|
||||
Solve iteratively the linear system 'fhess_p . xsupi = fgrad'
|
||||
with a conjugate gradient descent.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fhess_p : callable
|
||||
Function that takes the gradient as a parameter and returns the
|
||||
matrix product of the Hessian and gradient.
|
||||
|
||||
fgrad : ndarray of shape (n_features,) or (n_features + 1,)
|
||||
Gradient vector.
|
||||
|
||||
maxiter : int
|
||||
Number of CG iterations.
|
||||
|
||||
tol : float
|
||||
Stopping criterion.
|
||||
|
||||
Returns
|
||||
-------
|
||||
xsupi : ndarray of shape (n_features,) or (n_features + 1,)
|
||||
Estimated solution.
|
||||
"""
|
||||
xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
|
||||
ri = fgrad
|
||||
psupi = -ri
|
||||
i = 0
|
||||
dri0 = np.dot(ri, ri)
|
||||
|
||||
while i <= maxiter:
|
||||
if np.sum(np.abs(ri)) <= tol:
|
||||
break
|
||||
|
||||
Ap = fhess_p(psupi)
|
||||
# check curvature
|
||||
curv = np.dot(psupi, Ap)
|
||||
if 0 <= curv <= 3 * np.finfo(np.float64).eps:
|
||||
break
|
||||
elif curv < 0:
|
||||
if i > 0:
|
||||
break
|
||||
else:
|
||||
# fall back to steepest descent direction
|
||||
xsupi += dri0 / curv * psupi
|
||||
break
|
||||
alphai = dri0 / curv
|
||||
xsupi += alphai * psupi
|
||||
ri = ri + alphai * Ap
|
||||
dri1 = np.dot(ri, ri)
|
||||
betai = dri1 / dri0
|
||||
psupi = -ri + betai * psupi
|
||||
i = i + 1
|
||||
dri0 = dri1 # update np.dot(ri,ri) for next time.
|
||||
|
||||
return xsupi
|
||||
|
||||
|
||||
def _newton_cg(
    grad_hess,
    func,
    grad,
    x0,
    args=(),
    tol=1e-4,
    maxiter=100,
    maxinner=200,
    line_search=True,
    warn=True,
):
    """Minimize a scalar function of one or more variables with Newton-CG.

    Parameters
    ----------
    grad_hess : callable
        Called as ``grad_hess(xk, *args)``; should return the gradient and a
        callable returning the matvec product of the Hessian.

    func : callable
        Called as ``func(x, *args)``; should return the value of the
        function.

    grad : callable
        Handed to the line search as its derivative callable.

    x0 : array of float
        Initial guess. It is flattened before use.

    args : tuple, default=()
        Extra arguments passed to ``grad_hess`` and ``func``, and forwarded
        to the line search.

    tol : float, default=1e-4
        Stopping criterion. The iteration will stop when
        ``max{|g_i | i = 1, ..., n} <= tol``
        where ``g_i`` is the i-th component of the gradient.

    maxiter : int, default=100
        Number of Newton iterations.

    maxinner : int, default=200
        Number of CG iterations.

    line_search : bool, default=True
        Whether to use a line search or not. Without it a unit step is taken.

    warn : bool, default=True
        Whether to warn when didn't converge.

    Returns
    -------
    xk : ndarray of float
        Estimated minimum.

    k : int
        Number of Newton iterations that were completed.
    """
    x0 = np.asarray(x0).flatten()
    xk = x0
    n_newton = 0

    if line_search:
        old_fval = func(x0, *args)
        old_old_fval = None

    # Outer loop: the Newton iteration itself.
    while n_newton < maxiter:
        # The search direction solves  del2 f(xk) p = - fgrad f(xk)  by CG,
        # starting from 0.
        fgrad, fhess_p = grad_hess(xk, *args)

        abs_grad = np.abs(fgrad)
        if np.max(abs_grad) <= tol:
            break

        grad_l1 = np.sum(abs_grad)
        cg_tol = min(0.5, np.sqrt(grad_l1)) * grad_l1

        # Inner loop: solve the Newton update by conjugate gradient, to
        # avoid inverting the Hessian.
        direction = _cg(fhess_p, fgrad, maxiter=maxinner, tol=cg_tol)

        if line_search:
            try:
                step_size, _, _, old_fval, old_old_fval, _ = _line_search_wolfe12(
                    func, grad, xk, direction, fgrad, old_fval, old_old_fval, args=args
                )
            except _LineSearchError:
                warnings.warn("Line Search failed")
                break
        else:
            step_size = 1.0

        xk = xk + step_size * direction  # upcast if necessary
        n_newton += 1

    if warn and n_newton >= maxiter:
        warnings.warn(
            "newton-cg failed to converge. Increase the number of iterations.",
            ConvergenceWarning,
        )
    return xk, n_newton
|
||||
|
||||
|
||||
def _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None):
|
||||
"""Check the OptimizeResult for successful convergence
|
||||
|
||||
Parameters
|
||||
----------
|
||||
solver : str
|
||||
Solver name. Currently only `lbfgs` is supported.
|
||||
|
||||
result : OptimizeResult
|
||||
Result of the scipy.optimize.minimize function.
|
||||
|
||||
max_iter : int, default=None
|
||||
Expected maximum number of iterations.
|
||||
|
||||
extra_warning_msg : str, default=None
|
||||
Extra warning message.
|
||||
|
||||
Returns
|
||||
-------
|
||||
n_iter : int
|
||||
Number of iterations.
|
||||
"""
|
||||
# handle both scipy and scikit-learn solver names
|
||||
if solver == "lbfgs":
|
||||
if result.status != 0:
|
||||
try:
|
||||
# The message is already decoded in scipy>=1.6.0
|
||||
result_message = result.message.decode("latin1")
|
||||
except AttributeError:
|
||||
result_message = result.message
|
||||
warning_msg = (
|
||||
"{} failed to converge (status={}):\n{}.\n\n"
|
||||
"Increase the number of iterations (max_iter) "
|
||||
"or scale the data as shown in:\n"
|
||||
" https://scikit-learn.org/stable/modules/"
|
||||
"preprocessing.html"
|
||||
).format(solver, result.status, result_message)
|
||||
if extra_warning_msg is not None:
|
||||
warning_msg += "\n" + extra_warning_msg
|
||||
warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)
|
||||
if max_iter is not None:
|
||||
# In scipy <= 1.0.0, nit may exceed maxiter for lbfgs.
|
||||
# See https://github.com/scipy/scipy/issues/7854
|
||||
n_iter_i = min(result.nit, max_iter)
|
||||
else:
|
||||
n_iter_i = result.nit
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
return n_iter_i
|
||||
@@ -0,0 +1,97 @@
|
||||
# Author: Hamzeh Alsalhi <ha258@cornell.edu>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
import array
|
||||
|
||||
from . import check_random_state
|
||||
from ._random import sample_without_replacement
|
||||
|
||||
__all__ = ["sample_without_replacement"]
|
||||
|
||||
|
||||
def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
    """Generate a sparse random matrix given column class distributions.

    Parameters
    ----------
    n_samples : int,
        Number of samples to draw in each column.

    classes : list of size n_outputs of arrays of size (n_classes,)
        List of classes for each column. Only integer classes are supported.
        The list is updated in place: each entry is replaced by an int64
        array, with class 0 inserted at the front when it was missing.

    class_probability : list of size n_outputs of arrays of \
        shape (n_classes,), default=None
        Class distribution of each column. If None, uniform distribution is
        assumed.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the sampled classes.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    random_matrix : sparse csc matrix of size (n_samples, n_outputs)

    Raises
    ------
    ValueError
        If a class array is not of integer dtype, if a probability array
        does not sum to one, or if the lengths of ``classes[j]`` and
        ``class_probability[j]`` differ.
    """
    data = array.array("i")
    indices = array.array("i")
    indptr = array.array("i", [0])

    for j in range(len(classes)):
        classes[j] = np.asarray(classes[j])
        if classes[j].dtype.kind != "i":
            raise ValueError("class dtype %s is not supported" % classes[j].dtype)
        classes[j] = classes[j].astype(np.int64, copy=False)

        # use uniform distribution if no class_probability is given
        if class_probability is None:
            class_prob_j = np.empty(shape=classes[j].shape[0])
            class_prob_j.fill(1 / classes[j].shape[0])
        else:
            class_prob_j = np.asarray(class_probability[j])

        if not np.isclose(np.sum(class_prob_j), 1.0):
            raise ValueError(
                "Probability array at index {0} does not sum to one".format(j)
            )

        if class_prob_j.shape[0] != classes[j].shape[0]:
            raise ValueError(
                "classes[{0}] (length {1}) and "
                "class_probability[{0}] (length {2}) have "
                "different length.".format(
                    j, classes[j].shape[0], class_prob_j.shape[0]
                )
            )

        # If 0 is not present in the classes insert it with a probability 0.0
        if 0 not in classes[j]:
            classes[j] = np.insert(classes[j], 0, 0)
            class_prob_j = np.insert(class_prob_j, 0, 0.0)

        # If there are nonzero classes choose randomly using class_probability
        rng = check_random_state(random_state)
        if classes[j].shape[0] > 1:
            # Look the zero class up by position so that ``p_nonzero`` is a
            # scalar: calling int() on the 1-element array produced by
            # boolean-mask indexing is deprecated since NumPy 1.25.
            index_class_0 = np.flatnonzero(classes[j] == 0).item()
            p_nonzero = 1 - class_prob_j[index_class_0]
            nnz = int(n_samples * p_nonzero)
            ind_sample = sample_without_replacement(
                n_population=n_samples, n_samples=nnz, random_state=random_state
            )
            indices.extend(ind_sample)

            # Normalize probabilities for the nonzero elements
            classes_j_nonzero = classes[j] != 0
            class_probability_nz = class_prob_j[classes_j_nonzero]
            class_probability_nz_norm = class_probability_nz / np.sum(
                class_probability_nz
            )
            classes_ind = np.searchsorted(
                class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
            )
            data.extend(classes[j][classes_j_nonzero][classes_ind])
        indptr.append(len(indices))

    return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
|
||||
@@ -0,0 +1,128 @@
|
||||
import os
|
||||
from os.path import join
|
||||
|
||||
from sklearn._build_utils import gen_from_templates
|
||||
|
||||
|
||||
def configuration(parent_package="", top_path=None):
    """Build the numpy.distutils configuration of the ``utils`` subpackage.

    Registers every compiled extension of the package, generates the
    templated sources, and adds the ``tests`` subpackage.

    Parameters
    ----------
    parent_package : str, default=""
        Forwarded to ``Configuration``.

    top_path : str, default=None
        Forwarded to ``Configuration``.

    Returns
    -------
    config : Configuration
        The populated configuration object.
    """
    import numpy
    from numpy.distutils.misc_util import Configuration

    config = Configuration("utils", parent_package, top_path)

    # "m" is only added to the linked libraries on posix systems.
    libraries = ["m"] if os.name == "posix" else []

    def add_pyx_extension(name, **options):
        # Register an extension whose only source is "<name>.pyx".
        config.add_extension(name, sources=[name + ".pyx"], **options)

    add_pyx_extension("sparsefuncs_fast", libraries=libraries)
    add_pyx_extension("_cython_blas", libraries=libraries)
    add_pyx_extension(
        "arrayfuncs", include_dirs=[numpy.get_include()], libraries=libraries
    )

    # murmurhash additionally compiles the bundled C++ implementation.
    config.add_extension(
        "murmurhash",
        sources=["murmurhash.pyx", join("src", "MurmurHash3.cpp")],
        include_dirs=["src"],
    )

    add_pyx_extension(
        "_fast_dict",
        language="c++",
        include_dirs=[numpy.get_include()],
        libraries=libraries,
    )
    add_pyx_extension("_openmp_helpers", libraries=libraries)

    # generate files from a template
    gen_from_templates(
        [
            "sklearn/utils/_seq_dataset.pyx.tp",
            "sklearn/utils/_seq_dataset.pxd.tp",
            "sklearn/utils/_weight_vector.pyx.tp",
            "sklearn/utils/_weight_vector.pxd.tp",
        ]
    )

    add_pyx_extension("_seq_dataset", include_dirs=[numpy.get_include()])
    add_pyx_extension(
        "_weight_vector", include_dirs=[numpy.get_include()], libraries=libraries
    )
    add_pyx_extension(
        "_random", include_dirs=[numpy.get_include()], libraries=libraries
    )
    add_pyx_extension(
        "_logistic_sigmoid", include_dirs=[numpy.get_include()], libraries=libraries
    )
    add_pyx_extension("_readonly_array_wrapper", libraries=libraries)
    add_pyx_extension(
        "_typedefs", include_dirs=[numpy.get_include()], libraries=libraries
    )
    add_pyx_extension("_heap", libraries=libraries)
    add_pyx_extension(
        "_sorting",
        include_dirs=[numpy.get_include()],
        language="c++",
        libraries=libraries,
    )
    add_pyx_extension(
        "_vector_sentinel",
        include_dirs=[numpy.get_include()],
        libraries=libraries,
        language="c++",
    )

    config.add_subpackage("tests")

    return config
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # When executed as a script, hand the configuration of this subpackage
    # (converted to a dict) to numpy.distutils' setup().
    from numpy.distutils.core import setup

    setup(**configuration(top_path="").todict())
|
||||
@@ -0,0 +1,624 @@
|
||||
# Authors: Manoj Kumar
|
||||
# Thomas Unterthiner
|
||||
# Giorgio Patrini
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
import scipy.sparse as sp
|
||||
import numpy as np
|
||||
|
||||
from .sparsefuncs_fast import (
|
||||
csr_mean_variance_axis0 as _csr_mean_var_axis0,
|
||||
csc_mean_variance_axis0 as _csc_mean_var_axis0,
|
||||
incr_mean_variance_axis0 as _incr_mean_var_axis0,
|
||||
)
|
||||
from ..utils.validation import _check_sample_weight
|
||||
|
||||
|
||||
def _raise_typeerror(X):
|
||||
"""Raises a TypeError if X is not a CSR or CSC matrix"""
|
||||
input_type = X.format if sp.issparse(X) else type(X)
|
||||
err = "Expected a CSR or CSC sparse matrix, got %s." % input_type
|
||||
raise TypeError(err)
|
||||
|
||||
|
||||
def _raise_error_wrong_axis(axis):
|
||||
if axis not in (0, 1):
|
||||
raise ValueError(
|
||||
"Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis
|
||||
)
|
||||
|
||||
|
||||
def inplace_csr_column_scale(X, scale):
    """Inplace column scaling of a CSR matrix.

    Every stored value of ``X`` is multiplied by the entry of ``scale`` that
    corresponds to its column, assuming a (n_samples, n_features) shape.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix to scale. It should be of CSR format.

    scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
        Array of precomputed feature-wise values to use for scaling.
    """
    assert scale.shape[0] == X.shape[1]
    # X.indices holds the column of each stored value, so this gathers one
    # factor per stored value.
    factor_per_value = scale.take(X.indices, mode="clip")
    X.data *= factor_per_value
|
||||
|
||||
|
||||
def inplace_csr_row_scale(X, scale):
    """Inplace row scaling of a CSR matrix.

    Every stored value of ``X`` is multiplied by the entry of ``scale`` that
    corresponds to its row, assuming a (n_samples, n_features) shape.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix to be scaled. It should be of CSR format.

    scale : ndarray of float of shape (n_samples,)
        Array of precomputed sample-wise values to use for scaling.
    """
    assert scale.shape[0] == X.shape[0]
    # Repeat each row factor once per value stored in that row.
    values_per_row = np.diff(X.indptr)
    X.data *= np.repeat(scale, values_per_row)
|
||||
|
||||
|
||||
def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):
    """Compute mean and variance along an axis on a CSR or CSC matrix.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Input data. It can be of CSR or CSC format.

    axis : {0, 1}
        Axis along which the mean and variance should be computed.

    weights : ndarray of shape (n_samples,) or (n_features,), default=None
        if axis is set to 0 shape is (n_samples,) or
        if axis is set to 1 shape is (n_features,).
        If it is set to None, then samples are equally weighted.

        .. versionadded:: 0.24

    return_sum_weights : bool, default=False
        If True, returns the sum of weights seen for each feature
        if `axis=0` or each sample if `axis=1`.

        .. versionadded:: 0.24

    Returns
    -------

    means : ndarray of shape (n_features,), dtype=floating
        Feature-wise means.

    variances : ndarray of shape (n_features,), dtype=floating
        Feature-wise variances.

    sum_weights : ndarray of shape (n_features,), dtype=floating
        Returned if `return_sum_weights` is `True`.

    Raises
    ------
    ValueError
        If ``axis`` is neither 0 nor 1.

    TypeError
        If ``X`` is neither a CSR nor a CSC matrix.
    """
    _raise_error_wrong_axis(axis)

    # The helpers only reduce along axis 0. For axis 1 the transpose is
    # reduced instead, using the helper of the other format.
    if isinstance(X, sp.csr_matrix):
        axis0_kernel, axis1_kernel = _csr_mean_var_axis0, _csc_mean_var_axis0
    elif isinstance(X, sp.csc_matrix):
        axis0_kernel, axis1_kernel = _csc_mean_var_axis0, _csr_mean_var_axis0
    else:
        _raise_typeerror(X)

    if axis == 0:
        return axis0_kernel(
            X, weights=weights, return_sum_weights=return_sum_weights
        )
    return axis1_kernel(
        X.T, weights=weights, return_sum_weights=return_sum_weights
    )
|
||||
|
||||
|
||||
def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):
    """Compute incremental mean and variance along an axis on a CSR or
    CSC matrix.

    last_mean, last_var are the statistics computed at the last step by this
    function. Both must be initialized to 0-arrays of the proper size, i.e.
    the number of features in X. last_n is the number of samples encountered
    until now.

    Parameters
    ----------
    X : CSR or CSC sparse matrix of shape (n_samples, n_features)
        Input data.

    axis : {0, 1}
        Axis along which the statistics should be computed.

    last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating
        Array of means to update with the new data X.
        Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.

    last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating
        Array of variances to update with the new data X.
        Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.

    last_n : float or ndarray of shape (n_features,) or (n_samples,), \
            dtype=floating
        Sum of the weights seen so far, excluding the current weights.
        A value of size 1 is broadcast to the shape and dtype of
        ``last_mean``; otherwise its size must match that of ``last_mean``.

    weights : ndarray of shape (n_samples,) or (n_features,), default=None
        If axis is set to 0 shape is (n_samples,) or
        if axis is set to 1 shape is (n_features,).
        If it is set to None, then samples are equally weighted.

        .. versionadded:: 0.24

    Returns
    -------
    means : ndarray of shape (n_features,) or (n_samples,), dtype=floating
        Updated feature-wise means if axis = 0 or
        sample-wise means if axis = 1.

    variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating
        Updated feature-wise variances if axis = 0 or
        sample-wise variances if axis = 1.

    n : ndarray of shape (n_features,) or (n_samples,), dtype=integral
        Updated number of seen samples per feature if axis=0
        or number of seen features per sample if axis=1.

        If weights is not None, n is a sum of the weights of the seen
        samples or features instead of the actual number of seen
        samples or features.

    Raises
    ------
    ValueError
        If ``axis`` is invalid, or if the sizes of ``last_mean``,
        ``last_var`` and ``last_n`` disagree with each other or with ``X``.

    TypeError
        If ``X`` is neither a CSR nor a CSC matrix.

    Notes
    -----
    NaNs are ignored in the algorithm.
    """
    _raise_error_wrong_axis(axis)

    if not isinstance(X, (sp.csr_matrix, sp.csc_matrix)):
        _raise_typeerror(X)

    if np.size(last_n) == 1:
        # Broadcast a single count to one entry per statistic.
        last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)

    n_stats = np.size(last_mean)
    if n_stats != np.size(last_var) or np.size(last_var) != np.size(last_n):
        raise ValueError("last_mean, last_var, last_n do not have the same shapes.")

    if axis == 1:
        if n_stats != X.shape[0]:
            raise ValueError(
                "If axis=1, then last_mean, last_n, last_var should be of "
                f"size n_samples {X.shape[0]} (Got {n_stats})."
            )
        # The helper only works along axis 0: reduce the transpose instead.
        X = X.T
    elif n_stats != X.shape[1]:
        raise ValueError(
            "If axis=0, then last_mean, last_n, last_var should be of "
            f"size n_features {X.shape[1]} (Got {n_stats})."
        )

    if weights is not None:
        weights = _check_sample_weight(weights, X, dtype=X.dtype)

    return _incr_mean_var_axis0(
        X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights
    )
|
||||
|
||||
|
||||
def inplace_column_scale(X, scale):
    """Inplace column scaling of a CSC/CSR matrix.

    Each feature (column) of the data matrix is multiplied by the matching
    entry of ``scale``, assuming a (n_samples, n_features) shape.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix to scale. It should be of CSC or CSR format.

    scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
        Array of precomputed feature-wise values to use for scaling.

    Raises
    ------
    TypeError
        If ``X`` is neither a CSR nor a CSC matrix.
    """
    if isinstance(X, sp.csr_matrix):
        inplace_csr_column_scale(X, scale)
    elif isinstance(X, sp.csc_matrix):
        # The columns of X are the rows of X.T; this relies on the transpose
        # sharing its data array with X so the update is visible in X.
        inplace_csr_row_scale(X.T, scale)
    else:
        _raise_typeerror(X)
|
||||
|
||||
|
||||
def inplace_row_scale(X, scale):
    """Inplace row scaling of a CSR or CSC matrix.

    Each sample (row) of the data matrix is multiplied by the matching entry
    of ``scale``, assuming a (n_samples, n_features) shape.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix to be scaled. It should be of CSR or CSC format.

    scale : ndarray of shape (n_samples,), dtype={np.float32, np.float64}
        Array of precomputed sample-wise values to use for scaling.

    Raises
    ------
    TypeError
        If ``X`` is neither a CSR nor a CSC matrix.
    """
    if isinstance(X, sp.csr_matrix):
        inplace_csr_row_scale(X, scale)
    elif isinstance(X, sp.csc_matrix):
        # The rows of X are the columns of X.T; this relies on the transpose
        # sharing its data array with X so the update is visible in X.
        inplace_csr_column_scale(X.T, scale)
    else:
        _raise_typeerror(X)
|
||||
|
||||
|
||||
def inplace_swap_row_csc(X, m, n):
    """Swap two rows of a CSC matrix in-place.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix whose two rows are to be swapped. It should be of
        CSC format.

    m : int
        Index of the row of X to be swapped. Negative values count from the
        end.

    n : int
        Index of the row of X to be swapped. Negative values count from the
        end.

    Raises
    ------
    TypeError
        If ``m`` or ``n`` is a NumPy array.
    """
    for row in (m, n):
        if isinstance(row, np.ndarray):
            raise TypeError("m and n should be valid integers")

    n_rows = X.shape[0]
    if m < 0:
        m += n_rows
    if n < 0:
        n += n_rows

    # In CSC format X.indices stores row numbers, so swapping rows amounts to
    # relabelling them. Both masks are taken before anything is overwritten.
    stored_in_m = X.indices == m
    stored_in_n = X.indices == n
    X.indices[stored_in_n] = m
    X.indices[stored_in_m] = n
|
||||
|
||||
|
||||
def inplace_swap_row_csr(X, m, n):
    """Swap two rows of a CSR matrix in-place.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix whose two rows are to be swapped. It should be of
        CSR format.

    m : int
        Index of the row of X to be swapped. Negative values count from the
        end.

    n : int
        Index of the row of X to be swapped. Negative values count from the
        end.

    Raises
    ------
    TypeError
        If ``m`` or ``n`` is a NumPy array.
    """
    for t in [m, n]:
        if isinstance(t, np.ndarray):
            raise TypeError("m and n should be valid integers")

    if m < 0:
        m += X.shape[0]
    if n < 0:
        n += X.shape[0]

    # Swapping a row with itself is a no-op. It has to be short-circuited:
    # the concatenations below would otherwise emit the entries of that row
    # twice, leaving indices/data longer than indptr describes.
    if m == n:
        return

    # The following swapping makes life easier since m is assumed to be the
    # smaller integer below.
    if m > n:
        m, n = n, m

    indptr = X.indptr
    m_start = indptr[m]
    m_stop = indptr[m + 1]
    n_start = indptr[n]
    n_stop = indptr[n + 1]
    nz_m = m_stop - m_start
    nz_n = n_stop - n_start

    if nz_m != nz_n:
        # Modify indptr first: every row boundary between the two rows moves
        # by the difference in their number of stored values.
        X.indptr[m + 2 : n] += nz_n - nz_m
        X.indptr[m + 1] = m_start + nz_n
        X.indptr[n] = n_stop - nz_m

    # Rebuild indices and data as: rows before m, row n, rows strictly
    # between m and n, row m, rows after n.
    X.indices = np.concatenate(
        [
            X.indices[:m_start],
            X.indices[n_start:n_stop],
            X.indices[m_stop:n_start],
            X.indices[m_start:m_stop],
            X.indices[n_stop:],
        ]
    )
    X.data = np.concatenate(
        [
            X.data[:m_start],
            X.data[n_start:n_stop],
            X.data[m_stop:n_start],
            X.data[m_start:m_stop],
            X.data[n_stop:],
        ]
    )
|
||||
|
||||
|
||||
def inplace_swap_row(X, m, n):
    """
    Swap two rows of a CSC/CSR matrix in-place.

    The work is delegated to the format-specific routine matching the type
    of `X`.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix whose two rows are to be swapped. It should be of CSR or
        CSC format.

    m : int
        Index of the first row of X to be swapped.

    n : int
        Index of the second row of X to be swapped.
    """
    # CSC is tested first, exactly as before; anything that is neither CSC
    # nor CSR is handed to the shared type-error helper.
    if isinstance(X, sp.csc_matrix):
        inplace_swap_row_csc(X, m, n)
        return
    if isinstance(X, sp.csr_matrix):
        inplace_swap_row_csr(X, m, n)
        return
    _raise_typeerror(X)
|
||||
|
||||
|
||||
def inplace_swap_column(X, m, n):
    """
    Swap two columns of a CSC/CSR matrix in-place.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Matrix whose two columns are to be swapped. It should be of
        CSR or CSC format.

    m : int
        Index of the first column of X to be swapped.

    n : int
        Index of the second column of X to be swapped.
    """
    # Negative indices count from the last column.
    if m < 0:
        m += X.shape[1]
    if n < 0:
        n += X.shape[1]

    # The row-swapping routines are reused with the formats crossed over:
    # CSC input goes to the CSR row routine and CSR input to the CSC one.
    if isinstance(X, sp.csc_matrix):
        inplace_swap_row_csr(X, m, n)
        return
    if isinstance(X, sp.csr_matrix):
        inplace_swap_row_csc(X, m, n)
        return
    _raise_typeerror(X)
|
||||
|
||||
|
||||
def _minor_reduce(X, ufunc):
|
||||
major_index = np.flatnonzero(np.diff(X.indptr))
|
||||
|
||||
# reduceat tries casts X.indptr to intp, which errors
|
||||
# if it is int64 on a 32 bit system.
|
||||
# Reinitializing prevents this where possible, see #13737
|
||||
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
|
||||
value = ufunc.reduceat(X.data, X.indptr[major_index])
|
||||
return major_index, value
|
||||
|
||||
|
||||
def _min_or_max_axis(X, axis, min_or_max):
    """Apply a min/max-style ufunc along one axis of a sparse matrix.

    Parameters
    ----------
    X : sparse matrix
        Input data; it is converted to CSC when ``axis == 0`` and to CSR
        otherwise.

    axis : {0, 1}
        Axis along which the reduction is performed.

    min_or_max : numpy ufunc
        Binary ufunc used for the reduction (e.g. ``np.minimum``).

    Returns
    -------
    res : ndarray of shape (X.shape[1 - axis],)
        Dense, flattened result of the reduction.

    Raises
    ------
    ValueError
        If ``X.shape[axis]`` is 0.
    """
    N = X.shape[axis]
    if N == 0:
        raise ValueError("zero-size array to reduction operation")
    M = X.shape[1 - axis]
    mat = X.tocsc() if axis == 0 else X.tocsr()
    mat.sum_duplicates()
    major_index, value = _minor_reduce(mat, min_or_max)
    # Slices with fewer than N stored entries also contain implicit zeros,
    # so the reduced value has to be combined with 0.
    not_full = np.diff(mat.indptr)[major_index] < N
    value[not_full] = min_or_max(value[not_full], 0)
    # Only non-zero results need to be stored in the sparse result.
    mask = value != 0
    major_index = np.compress(mask, major_index)
    value = np.compress(mask, value)

    # The constant coordinate is created as integers: sparse coordinates are
    # indices, and float index arrays only work through an implicit cast.
    zeros = np.zeros(len(value), dtype=np.intp)
    if axis == 0:
        res = sp.coo_matrix((value, (zeros, major_index)), dtype=X.dtype, shape=(1, M))
    else:
        res = sp.coo_matrix((value, (major_index, zeros)), dtype=X.dtype, shape=(M, 1))
    # `toarray()` is the documented dense conversion; the `.A` shorthand is
    # a legacy matrix-only alias.
    return res.toarray().ravel()
|
||||
|
||||
|
||||
def _sparse_min_or_max(X, axis, min_or_max):
|
||||
if axis is None:
|
||||
if 0 in X.shape:
|
||||
raise ValueError("zero-size array to reduction operation")
|
||||
zero = X.dtype.type(0)
|
||||
if X.nnz == 0:
|
||||
return zero
|
||||
m = min_or_max.reduce(X.data.ravel())
|
||||
if X.nnz != np.product(X.shape):
|
||||
m = min_or_max(zero, m)
|
||||
return m
|
||||
if axis < 0:
|
||||
axis += 2
|
||||
if (axis == 0) or (axis == 1):
|
||||
return _min_or_max_axis(X, axis, min_or_max)
|
||||
else:
|
||||
raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
|
||||
|
||||
|
||||
def _sparse_min_max(X, axis):
    """Return the (minimum, maximum) pair of X along `axis`.

    Uses ``np.minimum`` and ``np.maximum`` as the reduction ufuncs.
    """
    lowest = _sparse_min_or_max(X, axis, np.minimum)
    highest = _sparse_min_or_max(X, axis, np.maximum)
    return lowest, highest
|
||||
|
||||
|
||||
def _sparse_nan_min_max(X, axis):
    """Return the (minimum, maximum) pair of X along `axis`.

    Uses ``np.fmin`` and ``np.fmax`` as the reduction ufuncs.
    """
    lowest = _sparse_min_or_max(X, axis, np.fmin)
    highest = _sparse_min_or_max(X, axis, np.fmax)
    return (lowest, highest)
|
||||
|
||||
|
||||
def min_max_axis(X, axis, ignore_nan=False):
    """Compute minimum and maximum along an axis on a CSR or CSC matrix.

    Optionally ignore NaN values.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Input data. It should be of CSR or CSC format.

    axis : {0, 1}
        Axis along which the minima and maxima should be computed.

    ignore_nan : bool, default=False
        Whether to ignore NaN values (True) or let them pass through
        (False).

        .. versionadded:: 0.20

    Returns
    -------

    mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}
        Feature-wise minima.

    maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}
        Feature-wise maxima.
    """
    if isinstance(X, (sp.csr_matrix, sp.csc_matrix)):
        # Pick the NaN-aware or the plain implementation, then run it.
        compute = _sparse_nan_min_max if ignore_nan else _sparse_min_max
        return compute(X, axis=axis)
    _raise_typeerror(X)
|
||||
|
||||
|
||||
def count_nonzero(X, axis=None, sample_weight=None):
    """A variant of X.getnnz() with extension to weighting on axis 0

    Useful in efficiently calculating multilabel metrics.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_labels)
        Input data. It should be of CSR format.

    axis : {0, 1}, default=None
        The axis on which the data is aggregated. ``-1`` and ``-2`` are
        accepted as aliases of ``1`` and ``0``.

    sample_weight : array-like of shape (n_samples,), default=None
        Weight for each row of X.

    Returns
    -------
    count : int, float or ndarray
        Number of stored entries (weighted by `sample_weight` when given):
        a scalar when ``axis`` is None, one value per row when ``axis == 1``
        and one value per column when ``axis == 0``.

    Raises
    ------
    TypeError
        If X is not in CSR format.

    ValueError
        If `axis` is not a supported value.
    """
    # The format check must run for every axis value. It used to be chained
    # as an `elif` after the negative-axis normalisation, so it was skipped
    # for axis=-1 / axis=-2 and non-CSR input silently gave wrong counts.
    if X.format != "csr":
        raise TypeError("Expected CSR sparse format, got {0}".format(X.format))

    if axis == -1:
        axis = 1
    elif axis == -2:
        axis = 0

    # We rely here on the fact that np.diff(Y.indptr) for a CSR
    # will return the number of nonzero entries in each row.
    # A bincount over Y.indices will return the number of nonzeros
    # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.
    if axis is None:
        if sample_weight is None:
            return X.nnz
        else:
            return np.dot(np.diff(X.indptr), sample_weight)
    elif axis == 1:
        out = np.diff(X.indptr)
        if sample_weight is None:
            # astype here is for consistency with axis=0 dtype
            return out.astype("intp")
        return out * sample_weight
    elif axis == 0:
        if sample_weight is None:
            return np.bincount(X.indices, minlength=X.shape[1])
        else:
            # Expand the per-row weights to one weight per stored entry.
            weights = np.repeat(sample_weight, np.diff(X.indptr))
            return np.bincount(X.indices, minlength=X.shape[1], weights=weights)
    else:
        raise ValueError("Unsupported axis: {0}".format(axis))
|
||||
|
||||
|
||||
def _get_median(data, n_zeros):
|
||||
"""Compute the median of data with n_zeros additional zeros.
|
||||
|
||||
This function is used to support sparse matrices; it modifies data
|
||||
in-place.
|
||||
"""
|
||||
n_elems = len(data) + n_zeros
|
||||
if not n_elems:
|
||||
return np.nan
|
||||
n_negative = np.count_nonzero(data < 0)
|
||||
middle, is_odd = divmod(n_elems, 2)
|
||||
data.sort()
|
||||
|
||||
if is_odd:
|
||||
return _get_elem_at_rank(middle, data, n_negative, n_zeros)
|
||||
|
||||
return (
|
||||
_get_elem_at_rank(middle - 1, data, n_negative, n_zeros)
|
||||
+ _get_elem_at_rank(middle, data, n_negative, n_zeros)
|
||||
) / 2.0
|
||||
|
||||
|
||||
def _get_elem_at_rank(rank, data, n_negative, n_zeros):
|
||||
"""Find the value in data augmented with n_zeros for the given rank"""
|
||||
if rank < n_negative:
|
||||
return data[rank]
|
||||
if rank - n_negative < n_zeros:
|
||||
return 0
|
||||
return data[rank - n_zeros]
|
||||
|
||||
|
||||
def csc_median_axis_0(X):
    """Find the median across axis 0 of a CSC matrix.

    It is equivalent to doing np.median(X, axis=0).

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features)
        Input data. It should be of CSC format.

    Returns
    -------
    median : ndarray of shape (n_features,)
        Median of each column.

    Raises
    ------
    TypeError
        If X is not a CSC matrix.
    """
    if not isinstance(X, sp.csc_matrix):
        raise TypeError("Expected matrix of CSC format, got %s" % X.format)

    n_samples, n_features = X.shape
    median = np.zeros(n_features)
    bounds = X.indptr

    for col in range(len(bounds) - 1):
        # Work on a copy: _get_median sorts its input in-place and X must
        # not be modified.
        column_values = np.copy(X.data[bounds[col] : bounds[col + 1]])
        n_implicit_zeros = n_samples - column_values.size
        median[col] = _get_median(column_values, n_implicit_zeros)

    return median
|
||||
Binary file not shown.
@@ -0,0 +1,69 @@
|
||||
import numpy as np
|
||||
|
||||
from .extmath import stable_cumsum
|
||||
|
||||
|
||||
def _weighted_percentile(array, sample_weight, percentile=50):
|
||||
"""Compute weighted percentile
|
||||
|
||||
Computes lower weighted percentile. If `array` is a 2D array, the
|
||||
`percentile` is computed along the axis 0.
|
||||
|
||||
.. versionchanged:: 0.24
|
||||
Accepts 2D `array`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
array : 1D or 2D array
|
||||
Values to take the weighted percentile of.
|
||||
|
||||
sample_weight: 1D or 2D array
|
||||
Weights for each value in `array`. Must be same shape as `array` or
|
||||
of shape `(array.shape[0],)`.
|
||||
|
||||
percentile: int or float, default=50
|
||||
Percentile to compute. Must be value between 0 and 100.
|
||||
|
||||
Returns
|
||||
-------
|
||||
percentile : int if `array` 1D, ndarray if `array` 2D
|
||||
Weighted percentile.
|
||||
"""
|
||||
n_dim = array.ndim
|
||||
if n_dim == 0:
|
||||
return array[()]
|
||||
if array.ndim == 1:
|
||||
array = array.reshape((-1, 1))
|
||||
# When sample_weight 1D, repeat for each array.shape[1]
|
||||
if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:
|
||||
sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T
|
||||
sorted_idx = np.argsort(array, axis=0)
|
||||
sorted_weights = np.take_along_axis(sample_weight, sorted_idx, axis=0)
|
||||
|
||||
# Find index of median prediction for each sample
|
||||
weight_cdf = stable_cumsum(sorted_weights, axis=0)
|
||||
adjusted_percentile = percentile / 100 * weight_cdf[-1]
|
||||
|
||||
# For percentile=0, ignore leading observations with sample_weight=0. GH20528
|
||||
mask = adjusted_percentile == 0
|
||||
adjusted_percentile[mask] = np.nextafter(
|
||||
adjusted_percentile[mask], adjusted_percentile[mask] + 1
|
||||
)
|
||||
|
||||
percentile_idx = np.array(
|
||||
[
|
||||
np.searchsorted(weight_cdf[:, i], adjusted_percentile[i])
|
||||
for i in range(weight_cdf.shape[1])
|
||||
]
|
||||
)
|
||||
percentile_idx = np.array(percentile_idx)
|
||||
# In rare cases, percentile_idx equals to sorted_idx.shape[0]
|
||||
max_idx = sorted_idx.shape[0] - 1
|
||||
percentile_idx = np.apply_along_axis(
|
||||
lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx
|
||||
)
|
||||
|
||||
col_index = np.arange(array.shape[1])
|
||||
percentile_in_sorted = sorted_idx[percentile_idx, col_index]
|
||||
percentile = array[percentile_in_sorted, col_index]
|
||||
return percentile[0] if n_dim == 1 else percentile
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user