first commit
This commit is contained in:
@@ -0,0 +1,481 @@
|
||||
"""
|
||||
.. _statsrefmanual:
|
||||
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
.. currentmodule:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions,
|
||||
summary and frequency statistics, correlation functions and statistical
|
||||
tests, masked statistics, kernel density estimation, quasi-Monte Carlo
|
||||
functionality, and more.
|
||||
|
||||
Statistics is a very large area, and there are topics that are out of scope
|
||||
for SciPy and are covered by other packages. Some of the most important ones
|
||||
are:
|
||||
|
||||
- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
|
||||
regression, linear models, time series analysis, extensions to topics
|
||||
also covered by ``scipy.stats``.
|
||||
- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
|
||||
functionality, interfaces to other statistical languages.
|
||||
- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
|
||||
modeling, probabilistic machine learning.
|
||||
- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
|
||||
model selection.
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
|
||||
- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
|
||||
|
||||
|
||||
Probability distributions
|
||||
=========================
|
||||
|
||||
Each univariate distribution is an instance of a subclass of `rv_continuous`
|
||||
(`rv_discrete` for discrete distributions):
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_discrete
|
||||
rv_histogram
|
||||
|
||||
Continuous distributions
|
||||
------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
argus -- Argus
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr (Type III)
|
||||
burr12 -- Burr (Type XII)
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
crystalball -- Crystalball
|
||||
dgamma -- Double Gamma
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponnorm -- Exponentially Modified Normal
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f -- F (Snedecor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
genlogistic -- Generalized Logistic
|
||||
gennorm -- Generalized normal
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
genhyperbolic -- Generalized Hyperbolic
|
||||
geninvgauss -- Generalized Inverse Gaussian
|
||||
gilbrat -- Gibrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
halfgennorm -- Generalized Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
kappa4 -- Kappa 4 parameter
|
||||
kappa3 -- Kappa 3 parameter
|
||||
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
|
||||
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
|
||||
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
|
||||
laplace -- Laplace
|
||||
laplace_asymmetric -- Asymmetric Laplace
|
||||
levy -- Levy
|
||||
levy_l
|
||||
levy_stable
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
loguniform -- Log-Uniform
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
moyal -- Moyal
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
norminvgauss -- Normal Inverse Gaussian
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
rayleigh -- Rayleigh
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
skewcauchy -- Skew Cauchy
|
||||
skewnorm -- Skew normal
|
||||
studentized_range -- Studentized Range
|
||||
t -- Student's T
|
||||
trapezoid -- Trapezoidal
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
vonmises_line -- Von-Mises (Line)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
Multivariate distributions
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
matrix_normal -- Matrix normal distribution
|
||||
dirichlet -- Dirichlet
|
||||
wishart -- Wishart
|
||||
invwishart -- Inverse Wishart
|
||||
multinomial -- Multinomial distribution
|
||||
special_ortho_group -- SO(N) group
|
||||
ortho_group -- O(N) group
|
||||
unitary_group -- U(N) group
|
||||
random_correlation -- random correlation matrices
|
||||
multivariate_t -- Multivariate t-distribution
|
||||
multivariate_hypergeom -- Multivariate hypergeometric distribution
|
||||
|
||||
Discrete distributions
|
||||
----------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
betabinom -- Beta-Binomial
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
|
||||
nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
|
||||
nhypergeom -- Negative Hypergeometric
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
yulesimon -- Yule-Simon
|
||||
zipf -- Zipf (Zeta)
|
||||
zipfian -- Zipfian
|
||||
|
||||
An overview of statistical functions is given below. Many of these functions
|
||||
have a similar version in `scipy.stats.mstats` which works for masked arrays.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
skew -- Skewness
|
||||
kstat --
|
||||
kstatvar --
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin --
|
||||
tmax --
|
||||
tstd --
|
||||
tsem --
|
||||
variation -- Coefficient of variation
|
||||
find_repeats
|
||||
trim_mean
|
||||
gstd -- Geometric Standard Deviation
|
||||
iqr
|
||||
sem
|
||||
bayes_mvs
|
||||
mvsdist
|
||||
entropy
|
||||
differential_entropy
|
||||
median_absolute_deviation
|
||||
median_abs_deviation
|
||||
bootstrap
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq
|
||||
itemfreq
|
||||
percentileofscore
|
||||
scoreatpercentile
|
||||
relfreq
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
Correlation functions
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
alexandergovern
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
weightedtau
|
||||
somersd
|
||||
linregress
|
||||
siegelslopes
|
||||
theilslopes
|
||||
multiscale_graphcorr
|
||||
|
||||
Statistical tests
|
||||
=================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_ind
|
||||
ttest_ind_from_stats
|
||||
ttest_rel
|
||||
chisquare
|
||||
cramervonmises
|
||||
cramervonmises_2samp
|
||||
power_divergence
|
||||
kstest
|
||||
ks_1samp
|
||||
ks_2samp
|
||||
epps_singleton_2samp
|
||||
mannwhitneyu
|
||||
tiecorrect
|
||||
rankdata
|
||||
ranksums
|
||||
wilcoxon
|
||||
kruskal
|
||||
friedmanchisquare
|
||||
brunnermunzel
|
||||
combine_pvalues
|
||||
jarque_bera
|
||||
page_trend_test
|
||||
permutation_test
|
||||
tukey_hsd
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ansari
|
||||
bartlett
|
||||
levene
|
||||
shapiro
|
||||
anderson
|
||||
anderson_ksamp
|
||||
binom_test
|
||||
binomtest
|
||||
fligner
|
||||
median_test
|
||||
mood
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
|
||||
|
||||
Quasi-Monte Carlo
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.qmc
|
||||
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Other statistical functionality
|
||||
===============================
|
||||
|
||||
Transformations
|
||||
---------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
yeojohnson
|
||||
yeojohnson_normmax
|
||||
yeojohnson_llf
|
||||
obrientransform
|
||||
sigmaclip
|
||||
trimboth
|
||||
trim1
|
||||
zmap
|
||||
zscore
|
||||
gzscore
|
||||
|
||||
Statistical distances
|
||||
---------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
wasserstein_distance
|
||||
energy_distance
|
||||
|
||||
Sampling
|
||||
--------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.sampling
|
||||
|
||||
Random variate generation / CDF Inversion
|
||||
-----------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rvs_ratio_uniforms
|
||||
NumericalInverseHermite
|
||||
|
||||
Circular statistical functions
|
||||
------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
circmean
|
||||
circvar
|
||||
circstd
|
||||
|
||||
Contingency table functions
|
||||
---------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
contingency.crosstab
|
||||
contingency.expected_freq
|
||||
contingency.margins
|
||||
contingency.relative_risk
|
||||
contingency.association
|
||||
fisher_exact
|
||||
barnard_exact
|
||||
boschloo_exact
|
||||
|
||||
Plot-tests
|
||||
----------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
yeojohnson_normplot
|
||||
|
||||
Univariate and multivariate kernel density estimation
|
||||
-----------------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
Warnings / Errors used in :mod:`scipy.stats`
|
||||
--------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
F_onewayConstantInputWarning
|
||||
F_onewayBadInputSizesWarning
|
||||
PearsonRConstantInputWarning
|
||||
PearsonRNearConstantInputWarning
|
||||
SpearmanRConstantInputWarning
|
||||
BootstrapDegenerateDistributionWarning
|
||||
|
||||
"""
|
||||
|
||||
from ._stats_py import *
|
||||
from ._variation import variation
|
||||
from .distributions import *
|
||||
from ._morestats import *
|
||||
from ._binomtest import binomtest
|
||||
from ._binned_statistic import *
|
||||
from ._kde import gaussian_kde
|
||||
from . import mstats
|
||||
from . import qmc
|
||||
from ._multivariate import *
|
||||
from . import contingency
|
||||
from .contingency import chi2_contingency
|
||||
from ._bootstrap import bootstrap, BootstrapDegenerateDistributionWarning
|
||||
from ._entropy import *
|
||||
from ._hypotests import *
|
||||
from ._rvs_sampling import rvs_ratio_uniforms, NumericalInverseHermite # noqa
|
||||
from ._page_trend_test import page_trend_test
|
||||
from ._mannwhitneyu import mannwhitneyu
|
||||
|
||||
# Deprecated namespaces, to be removed in v2.0.0
|
||||
from . import (
|
||||
biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, statlib, stats
|
||||
)
|
||||
|
||||
# Public API: every name currently in the module namespace that does not
# start with an underscore. This filters out single-underscore private names
# as well as dunders. It is evaluated here, before `test` is assigned below.
__all__ = [s for s in dir() if not s.startswith("_")]  # Remove dunders.

# Build the package-level `test` object, then delete the helper class so it
# does not remain in the module namespace.
# NOTE(review): presumably `test()` runs this package's test suite - confirm
# against scipy._lib._testutils.
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,347 @@
|
||||
# Many scipy.stats functions support `axis` and `nan_policy` parameters.
|
||||
# When the two are combined, it can be tricky to get all the behavior just
|
||||
# right. This file contains utility functions useful for scipy.stats functions
|
||||
# that support `axis` and `nan_policy`, including a decorator that
|
||||
# automatically adds `axis` and `nan_policy` arguments to a function.
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
import scipy.stats._stats_py
|
||||
from functools import wraps
|
||||
from scipy._lib._docscrape import FunctionDoc, Parameter
|
||||
import inspect
|
||||
|
||||
|
||||
def _broadcast_array_shapes_remove_axis(arrays, axis=None):
    """
    Broadcast shapes of arrays, dropping specified axes

    Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
    the shape of the broadcast result after consuming/dropping `axis`.
    In other words, return output shape of a typical hypothesis test on
    `arrays` vectorized along `axis`.

    Parameters
    ----------
    arrays : sequence of array_like
        The arrays whose shapes are to be broadcast against each other.
    axis : int, tuple of int, or None, optional
        Axis (or axes) to drop from every shape before broadcasting. Shapes
        are first left-padded with ones to a common number of dimensions,
        and `axis` indexes into those padded shapes.
        `None` (default) means no axis is dropped.

    Returns
    -------
    shape : tuple
        The broadcast shape with `axis` removed.

    Raises
    ------
    ValueError
        If the remaining dimensions cannot be broadcast together.

    Examples
    --------
    >>> a = np.zeros((5, 2, 1))
    >>> b = np.zeros((9, 3))
    >>> _broadcast_array_shapes_remove_axis((a, b), 1)
    (5, 3)
    """
    # Note that here, `axis=None` means do not consume/drop any axes - _not_
    # ravel arrays before broadcasting.
    # `np.shape` returns `arr.shape` when the attribute exists and otherwise
    # falls back to converting the input, so plain sequences work too.
    shapes = [np.shape(arr) for arr in arrays]
    return _broadcast_shapes_remove_axis(shapes, axis)
|
||||
|
||||
|
||||
def _broadcast_shapes_remove_axis(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, dropping specified axes
|
||||
|
||||
Same as _broadcast_array_shapes, but given a sequence
|
||||
of array shapes `shapes` instead of the arrays themselves.
|
||||
"""
|
||||
n_dims = max([len(shape) for shape in shapes])
|
||||
new_shapes = np.ones((len(shapes), n_dims), dtype=int)
|
||||
for row, shape in zip(new_shapes, shapes):
|
||||
row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
|
||||
if axis is not None:
|
||||
new_shapes = np.delete(new_shapes, axis, axis=1)
|
||||
new_shape = np.max(new_shapes, axis=0)
|
||||
new_shape *= new_shapes.all(axis=0)
|
||||
if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
|
||||
raise ValueError("Array shapes are incompatible for broadcasting.")
|
||||
return tuple(new_shape)
|
||||
|
||||
|
||||
def _broadcast_concatenate(xs, axis):
    """Join arrays along `axis`, broadcasting all remaining dimensions.

    Every array is left-padded with singleton dimensions to a common number
    of dimensions, broadcast against the others in all dimensions except
    `axis`, and then concatenated along `axis`.
    """
    # Give every array the same number of dimensions by prepending 1s.
    target_ndim = max([arr.ndim for arr in xs])
    padded = []
    for arr in xs:
        n_missing = target_ndim - arr.ndim
        padded.append(arr.reshape([1]*n_missing + list(arr.shape)))

    # Work with the concatenation axis in last position.
    swapped = [np.swapaxes(arr, axis, -1) for arr in padded]

    # Common shape of everything except that last axis.
    leading = _broadcast_array_shapes_remove_axis(swapped, axis=-1)

    # Broadcast the leading dimensions; each array keeps its own last length.
    expanded = [np.broadcast_to(arr, leading + (arr.shape[-1],))
                for arr in swapped]

    # Join along the last axis, then restore the original axis position.
    joined = np.concatenate(expanded, axis=-1)
    return np.swapaxes(joined, axis, -1)
|
||||
|
||||
|
||||
# TODO: add support for `axis` tuples
|
||||
def _remove_nans(samples, paired):
|
||||
"Remove nans from paired or unpaired samples"
|
||||
# potential optimization: don't copy arrays that don't contain nans
|
||||
if not paired:
|
||||
return [sample[~np.isnan(sample)] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
nans = np.isnan(samples[0])
|
||||
for sample in samples[1:]:
|
||||
nans = nans | np.isnan(sample)
|
||||
not_nans = ~nans
|
||||
return [sample[not_nans] for sample in samples]
|
||||
|
||||
|
||||
def _check_empty_inputs(samples, axis):
    """
    Return a NaN-filled result array if any sample is empty, else `None`.

    The returned array has the shape obtained by broadcasting the samples
    with `axis` removed, i.e. the output shape of a vectorized hypothesis
    test (it is itself empty when that shape contains a zero).
    """
    for sample in samples:
        if sample.size == 0:
            break
    else:
        # no sample is empty, so the caller has to perform the test
        return None

    # At least one sample is empty: the statistic and p-value are either
    # empty arrays or arrays of NaNs, depending on the broadcast shape.
    output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
    filler = np.ones(output_shape)
    return filler * np.nan
|
||||
|
||||
|
||||
# Standard docstring / signature entries for `axis` and `nan_policy`
# `_name`, `_type` and `_desc` are scratch variables: they are filled in for
# `axis` first and then rebound for `nan_policy` further down.
_name = 'axis'
_type = "int or None, default: 0"
# `.split('\n')` turns the description text into a list of lines.
_desc = (
    """If an int, the axis of the input along which to compute the statistic.
The statistic of each axis-slice (e.g. row) of the input will appear in a
corresponding element of the output.
If ``None``, the input will be raveled before computing the statistic."""
    .split('\n'))
# Docstring entry for `axis`.
_axis_parameter_doc = Parameter(_name, _type, _desc)
# Signature entry for `axis`: keyword-only, default 0.
_axis_parameter = inspect.Parameter(_name,
                                    inspect.Parameter.KEYWORD_ONLY,
                                    default=0)

_name = 'nan_policy'
_type = "{'propagate', 'omit', 'raise'}"
# NOTE(review): the continuation lines of the bullets below carry no leading
# indentation in this view of the file - confirm the list renders correctly.
_desc = (
    """Defines how to handle input NaNs.

- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
which the statistic is computed, the corresponding entry of the output
will be NaN.
- ``omit``: NaNs will be omitted when performing the calculation.
If insufficient data remains in the axis slice along which the
statistic is computed, the corresponding entry of the output will be
NaN.
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
    .split('\n'))
# Docstring entry for `nan_policy`.
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
# Signature entry for `nan_policy`: keyword-only, default 'propagate'.
_nan_policy_parameter = inspect.Parameter(_name,
                                          inspect.Parameter.KEYWORD_ONLY,
                                          default='propagate')
|
||||
|
||||
|
||||
def _axis_nan_policy_factory(result_object, default_axis=0,
                             n_samples=1, paired=False,
                             result_unpacker=None, too_small=0):
    """Factory for a wrapper that adds axis/nan_policy params to a function.

    Parameters
    ----------
    result_object : callable
        Callable that returns an object of the type returned by the function
        being wrapped (e.g. the namedtuple or dataclass returned by a
        statistical test) provided the separate components (e.g. statistic,
        pvalue).
    default_axis : int or None, default: 0
        The default value of the axis argument. Standard is 0 except when
        backwards compatibility demands otherwise (e.g. `None`).
    n_samples : int, callable, or None, default: 1
        The number of data samples accepted by the function
        (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
        parameters passed into the function and returns the number of data
        samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
        of samples (e.g. `kruskal`).
    paired : {False, True}
        Whether the function being wrapped treats the samples as paired (i.e.
        corresponding elements of each sample should be considered as
        different components of the same sample.)
    result_unpacker : callable, optional
        Function that unpacks the results of the function being wrapped into
        a tuple. This is essentially the inverse of `result_object`. Default
        is `None`, which is appropriate for statistical tests that return a
        statistic, pvalue tuple (rather than, e.g., a non-iterable
        dataclass).
    too_small : int, default: 0
        The largest unacceptably small sample for the function being wrapped.
        For example, some functions require samples of size two or more or
        they raise an error. This argument prevents the error from being
        raised when input is not 1D and instead places a NaN in the
        corresponding element of the result.

    Returns
    -------
    axis_nan_policy_decorator : callable
        Decorator that wraps a function and returns a version of it that
        handles `axis` and `nan_policy`, with the docstring and signature
        updated to include both parameters.
    """

    if result_unpacker is None:
        # Default: the last axis of the stacked results holds
        # (statistic, pvalue).
        def result_unpacker(res):
            return res[..., 0], res[..., 1]

    def is_too_small(samples):
        # True if any sample has `too_small` or fewer observations.
        return any(len(sample) <= too_small for sample in samples)

    def axis_nan_policy_decorator(hypotest_fun_in):
        @wraps(hypotest_fun_in)
        def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):

            if _no_deco:  # for testing, decorator does nothing
                return hypotest_fun_in(*args, **kwds)

            # We need to be flexible about whether position or keyword
            # arguments are used, but we need to make sure users don't pass
            # both for the same parameter. To complicate matters, some
            # functions accept samples with *args, and some functions already
            # accept `axis` and `nan_policy` as positional arguments.
            # The strategy is to make sure that there is no duplication
            # between `args` and `kwds`, combine the two into `kwds`, then
            # pop the samples, `nan_policy`, and `axis` from `kwds`, as they
            # are dealt with separately.

            # Check for intersection between positional and keyword args
            params = list(inspect.signature(hypotest_fun_in).parameters)
            if n_samples is None:
                # Give unique names to each positional sample argument
                # Note that *args can't be provided as a keyword argument
                params = [f"arg{i}" for i in range(len(args))] + params[1:]

            d_args = dict(zip(params, args))
            intersection = set(d_args) & set(kwds)
            if intersection:
                message = (f"{hypotest_fun_in.__name__}() got multiple values "
                           f"for argument '{list(intersection)[0]}'")
                raise TypeError(message)

            # Consolidate other positional and keyword args into `kwds`
            kwds.update(d_args)

            # rename avoids UnboundLocalError (assigning to `n_samples` here
            # would make it local to the wrapper)
            if callable(n_samples):
                n_samp = n_samples(kwds)
            else:
                n_samp = n_samples or len(args)

            # Extract the things we need here
            samples = [np.atleast_1d(kwds.pop(param))
                       for param in params[:n_samp]]
            # the wrapped function is treated as vectorized if it accepts
            # an `axis` parameter itself
            vectorized = 'axis' in params
            axis = kwds.pop('axis', default_axis)
            nan_policy = kwds.pop('nan_policy', 'propagate')
            del args  # avoid the possibility of passing both `args` and `kwds`

            if axis is None:
                samples = [sample.ravel() for sample in samples]
                axis = 0
            elif axis != int(axis):
                raise ValueError('`axis` must be an integer')
            axis = int(axis)

            # if axis is not needed, just handle nan_policy and return
            ndims = np.array([sample.ndim for sample in samples])
            if np.all(ndims <= 1):
                # Addresses nan_policy == "raise"
                contains_nans = []
                for sample in samples:
                    contains_nan, _ = (
                        scipy.stats._stats_py._contains_nan(sample,
                                                            nan_policy))
                    contains_nans.append(contains_nan)

                # Addresses nan_policy == "propagate"
                # Consider adding option to let function propagate nans, but
                # currently the hypothesis tests this is applied to do not
                # propagate nans in a sensible way
                if any(contains_nans) and nan_policy == 'propagate':
                    return result_object(np.nan, np.nan)

                # Addresses nan_policy == "omit"
                if any(contains_nans) and nan_policy == 'omit':
                    # consider passing in contains_nans
                    samples = _remove_nans(samples, paired)

                # ideally, this is what the behavior would be, but some
                # existing functions raise exceptions, so overriding it
                # would break backward compatibility.
                # if is_too_small(samples):
                #     return result_object(np.nan, np.nan)

                return hypotest_fun_in(*samples, **kwds)

            # check for empty input
            # ideally, move this to the top, but some existing functions raise
            # exceptions for empty input, so overriding it would break
            # backward compatibility.
            empty_output = _check_empty_inputs(samples, axis)
            if empty_output is not None:
                statistic = empty_output
                pvalue = empty_output.copy()
                return result_object(statistic, pvalue)

            # otherwise, concatenate all samples along axis, remembering where
            # each separate sample begins
            lengths = np.array([sample.shape[axis] for sample in samples])
            split_indices = np.cumsum(lengths)
            x = _broadcast_concatenate(samples, axis)

            # Addresses nan_policy == "raise"
            contains_nan, _ = (
                scipy.stats._stats_py._contains_nan(x, nan_policy))

            if vectorized and not contains_nan:
                return hypotest_fun_in(*samples, axis=axis, **kwds)

            # Each `hypotest_fun` below receives one 1-D slice of the
            # concatenated data and splits it back into separate samples.

            # Addresses nan_policy == "omit"
            if contains_nan and nan_policy == 'omit':
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp]
                    samples = _remove_nans(samples, paired)
                    if is_too_small(samples):
                        return result_object(np.nan, np.nan)
                    return hypotest_fun_in(*samples, **kwds)

            # Addresses nan_policy == "propagate"
            elif contains_nan and nan_policy == 'propagate':
                def hypotest_fun(x):
                    if np.isnan(x).any():
                        return result_object(np.nan, np.nan)
                    samples = np.split(x, split_indices)[:n_samp]
                    return hypotest_fun_in(*samples, **kwds)

            else:
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp]
                    return hypotest_fun_in(*samples, **kwds)

            x = np.moveaxis(x, axis, -1)
            res = np.apply_along_axis(hypotest_fun, axis=-1, arr=x)
            return result_object(*result_unpacker(res))

        # Replace the `axis`/`nan_policy` entries of the wrapped function's
        # docstring with the standard ones, or append them if absent.
        doc = FunctionDoc(axis_nan_policy_wrapper)
        parameter_names = [param.name for param in doc['Parameters']]
        if 'axis' in parameter_names:
            doc['Parameters'][parameter_names.index('axis')] = (
                _axis_parameter_doc)
        else:
            doc['Parameters'].append(_axis_parameter_doc)
        if 'nan_policy' in parameter_names:
            doc['Parameters'][parameter_names.index('nan_policy')] = (
                _nan_policy_parameter_doc)
        else:
            doc['Parameters'].append(_nan_policy_parameter_doc)
        doc = str(doc).split("\n", 1)[1]  # remove signature
        axis_nan_policy_wrapper.__doc__ = str(doc)

        # Advertise `axis`/`nan_policy` in the signature if the wrapped
        # function does not already declare them.
        sig = inspect.signature(axis_nan_policy_wrapper)
        parameters = sig.parameters
        parameter_list = list(parameters.values())
        if 'axis' not in parameters:
            parameter_list.append(_axis_parameter)
        if 'nan_policy' not in parameters:
            parameter_list.append(_nan_policy_parameter)
        sig = sig.replace(parameters=parameter_list)
        axis_nan_policy_wrapper.__signature__ = sig

        return axis_nan_policy_wrapper
    return axis_nan_policy_decorator
|
||||
Binary file not shown.
@@ -0,0 +1,27 @@
|
||||
# Declare the class with cdef
# Cython declarations of the C++ classes provided by "biasedurn/stocc.h".
# The block is marked `nogil`, so the declared members can be called without
# holding the GIL. Members declared `except +` have C++ exceptions
# translated into Python exceptions by Cython.
cdef extern from "biasedurn/stocc.h" nogil:
    # NOTE(review): the constructor arguments are unnamed in this
    # declaration; confirm their meaning and order against stocc.h.
    cdef cppclass CFishersNCHypergeometric:
        CFishersNCHypergeometric(int, int, int, double, double) except +
        int mode()
        double mean()
        double variance()
        double probability(int x)
        # Takes pointers for `mean` and `var` (presumably output
        # parameters - confirm) and also returns a double.
        double moments(double * mean, double * var)

    # Declares the same members as CFishersNCHypergeometric, plus a
    # zero-argument constructor.
    cdef cppclass CWalleniusNCHypergeometric:
        CWalleniusNCHypergeometric() except +
        CWalleniusNCHypergeometric(int, int, int, double, double) except +
        int mode()
        double mean()
        double variance()
        double probability(int x)
        double moments(double * mean, double * var)

    cdef cppclass StochasticLib3:
        StochasticLib3(int seed) except +
        double Random() except +
        void SetAccuracy(double accur)
        int FishersNCHyp (int n, int m, int N, double odds) except +
        int WalleniusNCHyp (int n, int m, int N, double odds) except +
        # Function-pointer members.
        # NOTE(review): presumably hooks for supplying the underlying random
        # number source - confirm where they are assigned.
        double(*next_double)()
        double(*next_normal)(const double m, const double s)
|
||||
@@ -0,0 +1,763 @@
|
||||
import builtins
|
||||
import numpy as np
|
||||
from numpy.testing import suppress_warnings
|
||||
from operator import index
|
||||
from collections import namedtuple
|
||||
|
||||
__all__ = ['binned_statistic',
|
||||
'binned_statistic_2d',
|
||||
'binned_statistic_dd']
|
||||
|
||||
|
||||
# Named 3-tuple with fields `statistic`, `bin_edges` and `binnumber`.
BinnedStatisticResult = namedtuple(
    'BinnedStatisticResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
                     bins=10, range=None):
    """
    Compute a binned statistic for one or more sets of data.

    This generalizes a histogram.  A histogram partitions the data into
    bins and reports how many points fall into each bin; this function can
    instead report the sum, mean, median, or any other statistic of the
    values (or sets of values) belonging to the points in each bin.

    Parameters
    ----------
    x : (N,) array_like
        The sequence of values to be binned.
    values : (N,) array_like or list of (N,) array_like
        The data from which the statistic is computed.  It must have the
        same shape as `x`, or be a set of sequences that each have the same
        shape as `x`.  In the latter case the statistic is computed for
        each sequence independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or sequence of scalars, optional
        An int gives the number of equal-width bins spanning the given
        range (10 by default).  A sequence gives the bin edges, including
        the rightmost edge, which allows non-uniform bin widths; the number
        of bins is then ``len(bins) - 1``.  Values in `x` below the lowest
        bin edge are assigned bin number 0, and values beyond the highest
        bin edge are assigned the bin number one past the last regular bin.
    range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins.  When omitted, the range is
        ``(x.min(), x.max())``.  Values outside the range do not contribute
        to the returned statistic.

    Returns
    -------
    statistic : array
        The values of the selected statistic in each bin.
    bin_edges : array of dtype float
        The bin edges, of length ``len(statistic) + 1``.
    binnumber : 1-D ndarray of ints
        Indices of the bins (corresponding to `bin_edges`) in which each value
        of `x` belongs.  Same length as `values`.  A binnumber of `i` means the
        corresponding value is between (bin_edges[i-1], bin_edges[i]).

    See Also
    --------
    numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
    ``[3, 4]``, which *includes* 4.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    First some basic examples:

    Create two evenly spaced bins in the range of the given sample, and sum the
    corresponding values in each of those bins:

    >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    BinnedStatisticResult(statistic=array([4. , 4.5]),
            bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))

    Multiple arrays of values can also be passed.  The statistic is calculated
    on each set independently:

    >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    BinnedStatisticResult(statistic=array([[4. , 4.5],
           [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
           binnumber=array([1, 1, 1, 2, 2]))

    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
    ...                        bins=3)
    BinnedStatisticResult(statistic=array([1., 2., 4.]),
            bin_edges=array([1., 2., 3., 4.]),
            binnumber=array([1, 2, 1, 2, 3]))

    As a second example, we now generate some random data of sailing boat speed
    as a function of wind speed, and then determine how fast our boat is for
    certain wind speeds:

    >>> rng = np.random.default_rng()
    >>> windspeed = 8 * rng.random(500)
    >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
    ...                 boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
    >>> plt.figure()
    >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
    ...            label='binned statistic of data')
    >>> plt.legend()

    Now we can use ``binnumber`` to select all datapoints with a windspeed
    below 1:

    >>> low_boatspeed = boatspeed[binnumber == 0]

    As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
    plot of a distribution that shows the mean and distribution around that
    mean per bin, on top of a regular histogram and the probability
    distribution function:

    >>> x = np.linspace(0, 5, num=500)
    >>> x_pdf = stats.maxwell.pdf(x)
    >>> samples = stats.maxwell.rvs(size=10000)

    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
    ...         statistic='mean', bins=25)
    >>> bin_width = (bin_edges[1] - bin_edges[0])
    >>> bin_centers = bin_edges[1:] - bin_width/2

    >>> plt.figure()
    >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
    ...          alpha=0.2, label='histogram of data')
    >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
    ...            label='binned statistic of data')
    >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
    >>> plt.legend(fontsize=10)
    >>> plt.show()

    """
    # A `bins` without a length is a plain bin count and is forwarded as-is.
    # A sequence is the list of edges for the single dimension, so it is
    # wrapped in a list -- except a length-1 sequence, which is forwarded
    # unchanged and read by `binned_statistic_dd` as a per-dimension spec.
    try:
        spec_length = len(bins)
    except TypeError:
        pass
    else:
        if spec_length != 1:
            bins = [np.asarray(bins, float)]

    # A bare (low, high) pair becomes the one-dimensional list of ranges.
    if range is not None and len(range) == 2:
        range = [range]

    stat, edges_per_dim, bin_ids = binned_statistic_dd(
        [x], values, statistic, bins, range)

    return BinnedStatisticResult(stat, edges_per_dim[0], bin_ids)
|
||||
|
||||
|
||||
# Return type of `binned_statistic_2d`: the per-bin statistic, the bin
# edges along x and y, and the bin index assigned to every input point.
BinnedStatistic2dResult = namedtuple(
    'BinnedStatistic2dResult',
    ['statistic', 'x_edge', 'y_edge', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False):
    """
    Compute a bidimensional binned statistic for one or more sets of data.

    This generalizes a two-dimensional histogram.  A histogram partitions
    the plane into bins and reports how many points fall into each bin;
    this function can instead report the sum, mean, median, or any other
    statistic of the values (or sets of values) belonging to the points in
    each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned along the first dimension.
    y : (N,) array_like
        A sequence of values to be binned along the second dimension.
    values : (N,) array_like or list of (N,) array_like
        The data from which the statistic is computed.  It must have the
        same shape as `x`, or be a list of sequences that each have the
        same shape as `x`.  In the latter case the statistic is computed
        for each sequence independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : int or [int, int] or array_like or [array, array], optional
        The bin specification:

          * the number of bins for the two dimensions (nx = ny = bins),
          * the number of bins in each dimension (nx, ny = bins),
          * the bin edges for the two dimensions (x_edge = y_edge = bins),
          * the bin edges in each dimension (x_edge, y_edge = bins).

        If the bin edges are specified, the number of bins will be,
        (nx = len(x_edge)-1, ny = len(y_edge)-1).

    range : (2,2) array_like, optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the `bins` parameters):
        [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
        considered outliers and not tallied in the histogram.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array of
        linearized bin indices.
        'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
        ndarray, where each row gives the bin numbers in the corresponding
        dimension.
        See the `binnumber` returned value, and the `Examples` section.

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : (nx, ny) ndarray
        The values of the selected statistic in each two-dimensional bin.
    x_edge : (nx + 1) ndarray
        The bin edges along the first dimension.
    y_edge : (ny + 1) ndarray
        The bin edges along the second dimension.
    binnumber : (N,) array of ints or (2,N) ndarray of ints
        For each input point ``(x[k], y[k])``, an integer identifying the
        bin into which it falls.  The representation depends on the
        `expand_binnumbers` argument.  See `Notes` for details.


    See Also
    --------
    numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd

    Notes
    -----
    Binedges:
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
    but excluding 2) and the second ``[2, 3)``.  The last bin, however, is
    ``[3, 4]``, which *includes* 4.

    `binnumber`:
    This returned argument assigns to each input point an integer that
    represents the bin in which it belongs.  The representation depends on the
    `expand_binnumbers` argument. If 'False' (default): The returned
    `binnumber` is a shape (N,) array of linearized indices mapping each
    point to its corresponding bin (using row-major ordering).
    Note that the returned linearized bin indices are used for an array with
    extra bins on the outer binedges to capture values outside of the defined
    bin bounds.
    If 'True': The returned `binnumber` is a shape (2,N) ndarray where
    each row indicates bin placements for each dimension respectively.  In each
    dimension, a binnumber of `i` means the corresponding value is between
    (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats

    Calculate the counts with explicit bin-edges:

    >>> x = [0.1, 0.1, 0.1, 0.6]
    >>> y = [2.1, 2.6, 2.1, 2.1]
    >>> binx = [0.0, 0.5, 1.0]
    >>> biny = [2.0, 2.5, 3.0]
    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
    >>> ret.statistic
    array([[2., 1.],
           [1., 0.]])

    The bin in which each sample is placed is given by the `binnumber`
    returned parameter.  By default, these are the linearized bin indices:

    >>> ret.binnumber
    array([5, 6, 5, 9])

    The bin indices can also be expanded into separate entries for each
    dimension using the `expand_binnumbers` parameter:

    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
    ...                                 expand_binnumbers=True)
    >>> ret.binnumber
    array([[1, 1, 1, 2],
           [1, 2, 1, 1]])

    Which shows that the first three elements belong in the xbin 1, and the
    fourth into xbin 2; and so on for y.

    """

    # This code is based on np.histogram2d
    try:
        spec_length = len(bins)
    except TypeError:
        # An object without a length counts as a single specification.
        spec_length = 1

    # A sequence that is neither a single item nor an (x, y) pair is one
    # array of edges shared by both dimensions.
    if spec_length not in (1, 2):
        shared_edges = np.asarray(bins, float)
        bins = [shared_edges, shared_edges]

    stat, edges_per_dim, bin_ids = binned_statistic_dd(
        [x, y], values, statistic, bins, range,
        expand_binnumbers=expand_binnumbers)

    x_edge, y_edge = edges_per_dim[0], edges_per_dim[1]
    return BinnedStatistic2dResult(stat, x_edge, y_edge, bin_ids)
|
||||
|
||||
|
||||
# Return type of `binned_statistic_dd`: the per-bin statistic, the list of
# bin edges per dimension, and the bin index assigned to every input point.
BinnedStatisticddResult = namedtuple(
    'BinnedStatisticddResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False,
                        binned_statistic_result=None):
    """
    Compute a multidimensional binned statistic for a set of data.

    This is a generalization of a histogramdd function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    sample : array_like
        Data to histogram passed as a sequence of D arrays of length N, or
        as an (N,D) array.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must have
        one element per point of `sample`, or be a list of sequences that
        each have one element per point.  If `values` is such a list, the
        statistic will be computed on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin. Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'std' : compute the standard deviation within each bin. This
            is implicitly calculated with ddof=0.  Empty bins will be
            represented by NaN; a bin holding a single value has a
            standard deviation of 0.
          * 'min' : compute the minimum of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each bin.
            Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic. This function
            will be called on the values in each bin.  Empty bins will be
            represented by function([]), or NaN if this returns an error.

    bins : sequence or positive int, optional
        The bin specification must be in one of the following forms:

          * A sequence of arrays describing the bin edges along each dimension.
          * The number of bins for each dimension (nx, ny, ... = bins).
          * The number of bins for all dimensions (nx = ny = ... = bins).
    range : sequence, optional
        A sequence of lower and upper bin edges to be used if the edges are
        not given explicitly in `bins`. Defaults to the minimum and maximum
        values along each dimension.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array of
        linearized bin indices.
        'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
        ndarray, where each row gives the bin numbers in the corresponding
        dimension.
        See the `binnumber` returned value, and the `Examples` section of
        `binned_statistic_2d`.

        .. versionadded:: 0.17.0
    binned_statistic_result : BinnedStatisticddResult, optional
        Result of a previous call to the function in order to reuse bin edges
        and bin numbers with new values and/or a different statistic.
        To reuse bin numbers, `expand_binnumbers` must have been set to False
        (the default) in that previous call; a result holding expanded
        (D,N) bin numbers is rejected with a ``ValueError``.

    Returns
    -------
    statistic : ndarray, shape(nx1, nx2, nx3,...)
        The values of the selected statistic in each bin.
    bin_edges : list of ndarrays
        A list of D arrays describing the (nxi + 1) bin edges for each
        dimension.
    binnumber : (N,) array of ints or (D,N) ndarray of ints
        This assigns to each element of `sample` an integer that represents the
        bin in which this observation falls.  The representation depends on the
        `expand_binnumbers` argument.  See `Notes` for details.

    Raises
    ------
    ValueError
        If `statistic` is neither callable nor one of the known names, if
        `bins` is an integer and `sample` contains non-finite values, or if
        `binned_statistic_result` carries bin numbers that are not 1-D.
    AttributeError
        If the number of `values` elements does not match the number of
        points in `sample` (unless `statistic` is 'count'), or if the length
        of `bins` differs from the number of dimensions of `sample`.

    See Also
    --------
    numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d

    Notes
    -----
    Binedges:
    All but the last (righthand-most) bin is half-open in each dimension.  In
    other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
    ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``.  The
    last bin, however, is ``[3, 4]``, which *includes* 4.

    `binnumber`:
    This returned argument assigns to each element of `sample` an integer that
    represents the bin in which it belongs.  The representation depends on the
    `expand_binnumbers` argument. If 'False' (default): The returned
    `binnumber` is a shape (N,) array of linearized indices mapping each
    element of `sample` to its corresponding bin (using row-major ordering).
    If 'True': The returned `binnumber` is a shape (D,N) ndarray where
    each row indicates bin placements for each dimension respectively.  In each
    dimension, a binnumber of `i` means the corresponding value is between
    (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.mplot3d import Axes3D

    Take an array of 600 (x, y) coordinates as an example.
    `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
    of dimension `D+1` is required.

    >>> mu = np.array([0., 1.])
    >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
    >>> multinormal = stats.multivariate_normal(mu, sigma)
    >>> data = multinormal.rvs(size=600, random_state=235412)
    >>> data.shape
    (600, 2)

    Create bins and count how many arrays fall in each bin:

    >>> N = 60
    >>> x = np.linspace(-3, 3, N)
    >>> y = np.linspace(-3, 4, N)
    >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
    ...                                 statistic='count')
    >>> bincounts = ret.statistic

    Set the volume and the location of bars:

    >>> dx = x[1] - x[0]
    >>> dy = y[1] - y[0]
    >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
    >>> z = 0

    >>> bincounts = bincounts.ravel()
    >>> x = x.ravel()
    >>> y = y.ravel()

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111, projection='3d')
    >>> with np.errstate(divide='ignore'):   # silence random axes3d warning
    ...     ax.bar3d(x, y, z, dx, dy, bincounts)

    Reuse bin numbers and bin edges with new values:

    >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
    ...                                  binned_statistic_result=ret,
    ...                                  statistic='mean')
    """
    known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
    if not callable(statistic) and statistic not in known_stats:
        raise ValueError('invalid statistic %r' % (statistic,))

    try:
        bins = index(bins)
    except TypeError:
        # bins is not an integer
        pass
    # If bins was an integer-like object, now it is an actual Python int.

    # NOTE: for _bin_edges(), see e.g. gh-11365
    if isinstance(bins, int) and not np.isfinite(sample).all():
        raise ValueError('%r contains non-finite values.' % (sample,))

    # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
    # `Dlen` is the length of elements along each dimension.
    # This code is based on np.histogramdd
    try:
        # `sample` is an ND-array.
        Dlen, Ndim = sample.shape
    except (AttributeError, ValueError):
        # `sample` is a sequence of 1D arrays.
        sample = np.atleast_2d(sample).T
        Dlen, Ndim = sample.shape

    # Store initial shape of `values` to preserve it in the output
    values = np.asarray(values)
    input_shape = list(values.shape)
    # Make sure that `values` is 2D to iterate over rows
    values = np.atleast_2d(values)
    Vdim, Vlen = values.shape

    # Make sure `values` match `sample`
    if statistic != 'count' and Vlen != Dlen:
        raise AttributeError('The number of `values` elements must match the '
                             'length of each `sample` dimension.')

    try:
        M = len(bins)
        if M != Ndim:
            raise AttributeError('The dimension of bins must be equal '
                                 'to the dimension of the sample x.')
    except TypeError:
        # A scalar bin count applies to every dimension.
        bins = Ndim * [bins]

    if binned_statistic_result is None:
        nbin, edges, dedges = _bin_edges(sample, bins, range)
        binnumbers = _bin_numbers(sample, nbin, edges, dedges)
    else:
        edges = binned_statistic_result.bin_edges
        nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
        # +1 for outlier bins
        binnumbers = binned_statistic_result.binnumber
        # Every branch below indexes the flattened result with linearized
        # bin numbers.  Expanded (D, N) bin numbers would either fail deep
        # inside NumPy or index the wrong cells, so reject them up front.
        if np.ndim(binnumbers) != 1:
            raise ValueError(
                '`binned_statistic_result.binnumber` must be a 1-D array of '
                'linearized bin indices; recompute the result with '
                '`expand_binnumbers=False` to reuse it.')

    result = np.empty([Vdim, nbin.prod()], float)

    if statistic == 'mean':
        result.fill(np.nan)
        flatcount = np.bincount(binnumbers, None)
        a = flatcount.nonzero()
        for vv in builtins.range(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            result[vv, a] = flatsum[a] / flatcount[a]
    elif statistic == 'std':
        result.fill(np.nan)
        flatcount = np.bincount(binnumbers, None)
        a = flatcount.nonzero()
        for vv in builtins.range(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            # Deviation of every value from the mean of its own bin.
            delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
            std = np.sqrt(np.bincount(binnumbers, delta**2)[a] / flatcount[a])
            result[vv, a] = std
    elif statistic == 'count':
        result.fill(0)
        flatcount = np.bincount(binnumbers, None)
        a = np.arange(len(flatcount))
        result[:, a] = flatcount[np.newaxis, :]
    elif statistic == 'sum':
        result.fill(0)
        for vv in builtins.range(Vdim):
            flatsum = np.bincount(binnumbers, values[vv])
            a = np.arange(len(flatsum))
            result[vv, a] = flatsum
    elif statistic == 'median':
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            # Sort by bin first and by value within each bin, so that the
            # values of one bin form a sorted, contiguous run.
            i = np.lexsort((values[vv], binnumbers))
            _, j, counts = np.unique(binnumbers[i],
                                     return_index=True, return_counts=True)
            # Fractional position of the middle of each run; floor and ceil
            # coincide for odd counts and straddle the middle for even ones.
            mid = j + (counts - 1) / 2
            mid_a = values[vv, i][np.floor(mid).astype(int)]
            mid_b = values[vv, i][np.ceil(mid).astype(int)]
            medians = (mid_a + mid_b) / 2
            result[vv, binnumbers[i][j]] = medians
    elif statistic == 'min':
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            i = np.argsort(values[vv])[::-1]  # Reversed so the min is last
            result[vv, binnumbers[i]] = values[vv, i]
    elif statistic == 'max':
        result.fill(np.nan)
        for vv in builtins.range(Vdim):
            # Ascending order, so the max of each bin is written last.
            i = np.argsort(values[vv])
            result[vv, binnumbers[i]] = values[vv, i]
    elif callable(statistic):
        # Probe the statistic on empty input to get the value used for
        # empty bins; any failure falls back to NaN.
        with np.errstate(invalid='ignore'), suppress_warnings() as sup:
            sup.filter(RuntimeWarning)
            try:
                null = statistic([])
            except Exception:
                null = np.nan
        result.fill(null)
        _calc_binned_statistic(Vdim, binnumbers, result, values, statistic)

    # Shape into a proper matrix
    result = result.reshape(np.append(Vdim, nbin))

    # Remove outliers (indices 0 and -1 for each bin-dimension).
    core = tuple([slice(None)] + Ndim * [slice(1, -1)])
    result = result[core]

    # Unravel binnumbers into an ndarray, each row the bins for each dimension
    if expand_binnumbers and Ndim > 1:
        binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))

    if np.any(result.shape[1:] != nbin - 2):
        raise RuntimeError('Internal Shape Error')

    # Reshape to have output (`result`) match input (`values`) shape
    result = result.reshape(input_shape[:-1] + list(nbin-2))

    return BinnedStatisticddResult(result, edges, binnumbers)
|
||||
|
||||
|
||||
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
    """Fill `result` in place with `stat_func` evaluated per occupied bin.

    For each of the first `Vdim` rows of `values`, the values are grouped
    by bin number and ``stat_func`` is called once per bin that occurs in
    `bin_numbers`; its return value is stored at ``result[row, bin]``.
    Entries of `result` for bins that do not occur are left untouched.
    """
    occupied_bins = np.unique(bin_numbers)
    for row in builtins.range(Vdim):
        grouped = _create_binned_data(bin_numbers, occupied_bins,
                                      values, row)
        # `grouped` holds one entry per occupied bin, in the order of
        # `occupied_bins`.
        for bin_id, members in grouped.items():
            result[row, bin_id] = stat_func(np.array(members))
|
||||
|
||||
|
||||
def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
    """Group row `vv` of `values` by bin number.

    Returns a dict with one key per entry of `unique_bin_numbers`; each
    value is the list of ``values[vv, k]`` for the positions ``k`` whose
    ``bin_numbers[k]`` equals that key, in their original order.
    """
    grouped = {bin_id: [] for bin_id in unique_bin_numbers}
    for position, bin_id in enumerate(bin_numbers):
        grouped[bin_id].append(values[vv, position])
    return grouped
|
||||
|
||||
|
||||
def _bin_edges(sample, bins=None, range=None):
    """Create the bin edge arrays for every dimension of `sample`.

    Parameters
    ----------
    sample : ndarray, shape (N, D)
        The points to be binned, one row per point.
    bins : sequence of length D
        For each dimension either a scalar number of equal-width bins or
        an array of bin edges.  Although the signature defaults to None,
        the body indexes ``bins[i]``, so a sequence must be supplied.
    range : sequence of D (low, high) pairs, optional
        Outer edges used for the dimensions given as a bin count.  When
        None, the minimum and maximum of `sample` along each dimension
        are used.

    Returns
    -------
    nbin : ndarray of int, shape (D,)
        Number of bins per dimension, including the two outlier bins.
    edges : list of D ndarrays
        Bin edges per dimension.
    dedges : list of D ndarrays
        Differences between consecutive edges per dimension.

    Raises
    ------
    ValueError
        If `range` does not have one entry per dimension, if a range has
        its start after its stop, or if a scalar bin count is less than 1.
    """
    _, Ndim = sample.shape

    nbin = np.empty(Ndim, int)    # Number of bins in each dimension
    edges = Ndim * [None]         # Bin edges for each dimension
    dedges = Ndim * [None]        # Spacing between edges for each dimension

    # Select range for each dimension
    # Used only if number of bins is given.
    if range is None:
        smin = np.atleast_1d(np.array(sample.min(axis=0), float))
        smax = np.atleast_1d(np.array(sample.max(axis=0), float))
    else:
        if len(range) != Ndim:
            raise ValueError(
                f"range given for {len(range)} dimensions; {Ndim} required")
        smin = np.empty(Ndim)
        smax = np.empty(Ndim)
        for i in builtins.range(Ndim):
            if range[i][1] < range[i][0]:
                raise ValueError(
                    "In {}range, start must be <= stop".format(
                        f"dimension {i + 1} of " if Ndim > 1 else ""))
            smin[i], smax[i] = range[i]

    # Make sure the bins have a finite width: a zero-width range is
    # widened by half a unit on each side.
    zero_width = smin == smax
    smin[zero_width] -= .5
    smax[zero_width] += .5

    # Preserve sample floating point precision in bin edges
    edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
                   else float)

    # Create edge arrays
    for i in builtins.range(Ndim):
        if np.isscalar(bins[i]):
            # A count below 1 would give fewer than two edges and only
            # fail later with an unrelated error, so reject it here.
            if bins[i] < 1:
                raise ValueError(
                    f"`bins[{i}]` must be positive, when an integer")
            nbin[i] = bins[i] + 2  # +2 for outlier bins
            edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
                                   dtype=edges_dtype)
        else:
            edges[i] = np.asarray(bins[i], edges_dtype)
            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
        dedges[i] = np.diff(edges[i])

    return nbin, edges, dedges
|
||||
|
||||
|
||||
def _bin_numbers(sample, nbin, edges, dedges):
    """Compute the linearized bin number each sample falls into.

    Each column of the (N, D) `sample` is digitized against the edges of
    its dimension, points lying on the rightmost edge are moved into the
    last regular bin, and the per-dimension indices are combined into one
    flat index per point using the shape `nbin`.

    Raises ValueError when the smallest edge spacing of a dimension is 0.
    """
    _, n_dims = sample.shape

    per_dim_bins = [np.digitize(sample[:, d], edges[d])
                    for d in range(n_dims)]

    # Using `digitize`, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right
    # edge to be counted in the last bin, and not as an outlier.
    for d in range(n_dims):
        # The rounding precision is derived from the narrowest bin.
        narrowest = dedges[d].min()
        if narrowest == 0:
            raise ValueError('The smallest edge difference is numerically 0.')
        n_decimals = int(-np.log10(narrowest)) + 6

        coords = sample[:, d]
        last_edge = edges[d][-1]
        # Points at or past the last edge that equal it after rounding.
        rounds_to_edge = (np.around(coords, n_decimals) ==
                          np.around(last_edge, n_decimals))
        on_last_edge = (coords >= last_edge) & rounds_to_edge
        # Shift these points one bin to the left.
        per_dim_bins[d][on_last_edge] -= 1

    # Compute the sample indices in the flattened statistic matrix.
    return np.ravel_multi_index(per_dim_bins, nbin)
|
||||
@@ -0,0 +1,371 @@
|
||||
from math import sqrt
|
||||
import numpy as np
|
||||
from scipy._lib._util import _validate_int
|
||||
from scipy.optimize import brentq
|
||||
from scipy.special import ndtri
|
||||
from ._discrete_distns import binom
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
class BinomTestResult:
    """
    Result of `scipy.stats.binomtest`.

    Attributes
    ----------
    k : int
        The number of successes (copied from `binomtest` input).
    n : int
        The number of trials (copied from `binomtest` input).
    alternative : str
        Indicates the alternative hypothesis specified in the input
        to `binomtest`.  It will be one of ``'two-sided'``, ``'greater'``,
        or ``'less'``.
    pvalue : float
        The p-value of the hypothesis test.
    proportion_estimate : float
        The estimate of the proportion of successes.

    """
    def __init__(self, k, n, alternative, pvalue, proportion_estimate):
        # Plain value holder: every argument is stored unchanged.
        self.k = k
        self.n = n
        self.alternative = alternative
        self.proportion_estimate = proportion_estimate
        self.pvalue = pvalue

    def __repr__(self):
        parts = (
            f"k={self.k}",
            f"n={self.n}",
            f"alternative={self.alternative!r}",
            f"proportion_estimate={self.proportion_estimate}",
            f"pvalue={self.pvalue}",
        )
        return "BinomTestResult(" + ", ".join(parts) + ")"

    def proportion_ci(self, confidence_level=0.95, method='exact'):
        """
        Compute the confidence interval for the estimated proportion.

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level for the computed confidence interval
            of the estimated proportion. Default is 0.95.
        method : {'exact', 'wilson', 'wilsoncc'}, optional
            Selects the method used to compute the confidence interval
            for the estimate of the proportion:

            'exact' :
                Use the Clopper-Pearson exact method [1]_.
            'wilson' :
                Wilson's method, without continuity correction ([2]_, [3]_).
            'wilsoncc' :
                Wilson's method, with continuity correction ([2]_, [3]_).

            Default is ``'exact'``.

        Returns
        -------
        ci : ``ConfidenceInterval`` object
            The object has attributes ``low`` and ``high`` that hold the
            lower and upper bounds of the confidence interval.

        Raises
        ------
        ValueError
            If `method` is not one of the supported names, or if
            `confidence_level` is not in the interval [0, 1].

        References
        ----------
        .. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
               fiducial limits illustrated in the case of the binomial,
               Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
        .. [2] E. B. Wilson, Probable inference, the law of succession, and
               statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
               (1927).
        .. [3] Robert G. Newcombe, Two-sided confidence intervals for the
               single proportion: comparison of seven methods, Statistics
               in Medicine, 17, pp 857-872 (1998).

        Examples
        --------
        >>> from scipy.stats import binomtest
        >>> result = binomtest(k=7, n=50, p=0.1)
        >>> result.proportion_estimate
        0.14
        >>> result.proportion_ci()
        ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
        """
        if method not in ('exact', 'wilson', 'wilsoncc'):
            raise ValueError("method must be one of 'exact', 'wilson' or "
                             "'wilsoncc'.")
        if not (0 <= confidence_level <= 1):
            raise ValueError('confidence_level must be in the interval '
                             '[0, 1].')

        if method == 'exact':
            bounds = _binom_exact_conf_int(self.k, self.n,
                                           confidence_level,
                                           self.alternative)
        else:
            # Both Wilson variants share one helper; only the continuity
            # correction flag differs.
            use_correction = method == 'wilsoncc'
            bounds = _binom_wilson_conf_int(self.k, self.n,
                                            confidence_level,
                                            self.alternative,
                                            correction=use_correction)
        low, high = bounds
        return ConfidenceInterval(low=low, high=high)
|
||||
|
||||
|
||||
def _findp(func):
    """Return a root of `func` on the interval [0, 1] found with `brentq`.

    Parameters
    ----------
    func : callable
        Function of one variable passed directly to `brentq` with the
        bracket ``(0, 1)``.

    Returns
    -------
    p : float
        The root reported by `brentq`.

    Raises
    ------
    RuntimeError
        If `brentq` raises a ``RuntimeError``; the original exception
        context is suppressed.
    ValueError
        If `brentq` raises a ``ValueError``; the original exception is
        chained as the cause.
    """
    try:
        return brentq(func, 0, 1)
    except RuntimeError:
        message = ('numerical solver failed to converge when '
                   'computing the confidence limits')
        raise RuntimeError(message) from None
    except ValueError as exc:
        message = ('brentq raised a ValueError; report this to the '
                   'SciPy developers')
        raise ValueError(message) from exc
|
||||
|
||||
|
||||
def _binom_exact_conf_int(k, n, confidence_level, alternative):
    """
    Compute the exact confidence limits for the binomial proportion.

    The limits are found by solving, with `_findp`, for the probability at
    which a binomial tail probability equals ``alpha``.

    Parameters
    ----------
    k : int
        Number of successes.
    n : int
        Number of trials.
    confidence_level : float
        Confidence level of the interval.
    alternative : {'two-sided', 'less', 'greater'}
        Alternative hypothesis.  This function assumes the value has
        already been validated by the caller.

    Returns
    -------
    plow, phigh : float
        Lower and upper confidence limits.
    """
    def lower_limit(alpha):
        # With no successes the lower limit is pinned at 0.
        if k == 0:
            return 0.0
        return _findp(lambda p: binom.sf(k-1, n, p) - alpha)

    def upper_limit(alpha):
        # With only successes the upper limit is pinned at 1.
        if k == n:
            return 1.0
        return _findp(lambda p: binom.cdf(k, n, p) - alpha)

    if alternative == 'two-sided':
        # Split the non-coverage probability evenly between both tails.
        alpha = (1 - confidence_level) / 2
        plow = lower_limit(alpha)
        phigh = upper_limit(alpha)
    elif alternative == 'less':
        alpha = 1 - confidence_level
        plow = 0.0
        phigh = upper_limit(alpha)
    elif alternative == 'greater':
        alpha = 1 - confidence_level
        plow = lower_limit(alpha)
        phigh = 1.0
    return plow, phigh
|
||||
|
||||
|
||||
def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
    """Compute Wilson confidence limits for the binomial proportion.

    This function assumes that the arguments have already been validated.
    In particular, `alternative` must be one of 'two-sided', 'less' or
    'greater'.

    Parameters
    ----------
    k : int
        Number of successes.
    n : int
        Number of trials.
    confidence_level : float
        Confidence level of the interval.
    alternative : {'two-sided', 'less', 'greater'}
        Alternative hypothesis.
    correction : bool
        Whether to apply the continuity correction.

    Returns
    -------
    lo, hi : float
        Lower and upper confidence limits.
    """
    p = k / n
    q = 1 - p

    # A two-sided interval splits the non-coverage between both tails.
    if alternative == 'two-sided':
        z = ndtri(0.5 + 0.5*confidence_level)
    else:
        z = ndtri(confidence_level)

    # For reference, the formulas implemented here are from
    # Newcombe (1998) (ref. [3] in the proportion_ci docstring).
    denom = 2*(n + z**2)
    center = (2*n*p + z**2)/denom

    # A limit is fixed at the boundary when the alternative is one-sided in
    # that direction or when the observed count sits at that extreme.
    lower_is_fixed = alternative == 'less' or k == 0
    upper_is_fixed = alternative == 'greater' or k == n

    if correction:
        if lower_is_fixed:
            lo = 0.0
        else:
            lo = center - (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
        if upper_is_fixed:
            hi = 1.0
        else:
            hi = center + (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
    else:
        delta = z/denom * sqrt(4*n*p*q + z**2)
        lo = 0.0 if lower_is_fixed else center - delta
        hi = 1.0 if upper_is_fixed else center + delta

    return lo, hi
|
||||
|
||||
|
||||
def binomtest(k, n, p=0.5, alternative='two-sided'):
    """
    Perform a test that the probability of success is p.

    The binomial test [1]_ is a test of the null hypothesis that the
    probability of success in a Bernoulli experiment is `p`.

    Details of the test can be found in many texts on statistics, such
    as section 24.5 of [2]_.

    Parameters
    ----------
    k : int
        The number of successes.
    n : int
        The number of trials.
    p : float, optional
        The hypothesized probability of success, i.e. the expected
        proportion of successes.  The value must be in the interval
        ``0 <= p <= 1``. The default value is ``p = 0.5``.
    alternative : {'two-sided', 'greater', 'less'}, optional
        Indicates the alternative hypothesis. The default value is
        'two-sided'.

    Returns
    -------
    result : `~scipy.stats._result_classes.BinomTestResult` instance
        The return value is an object with the following attributes:

        k : int
            The number of successes (copied from `binomtest` input).
        n : int
            The number of trials (copied from `binomtest` input).
        alternative : str
            Indicates the alternative hypothesis specified in the input
            to `binomtest`.  It will be one of ``'two-sided'``, ``'greater'``,
            or ``'less'``.
        pvalue : float
            The p-value of the hypothesis test.
        proportion_estimate : float
            The estimate of the proportion of successes.

        The object has the following methods:

        proportion_ci(confidence_level=0.95, method='exact') :
            Compute the confidence interval for ``proportion_estimate``.

    Raises
    ------
    ValueError
        If `k` is greater than `n`, if `p` is outside ``[0, 1]``, or if
        `alternative` is not one of the recognized names.

    Notes
    -----
    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
    .. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
           Prentice Hall, Upper Saddle River, New Jersey USA (2010)

    Examples
    --------
    >>> from scipy.stats import binomtest

    A car manufacturer claims that no more than 10% of their cars are unsafe.
    15 cars are inspected for safety, 3 were found to be unsafe. Test the
    manufacturer's claim:

    >>> result = binomtest(3, n=15, p=0.1, alternative='greater')
    >>> result.pvalue
    0.18406106910639114

    The null hypothesis cannot be rejected at the 5% level of significance
    because the returned p-value is greater than the critical value of 5%.

    The estimated proportion is simply ``3/15``:

    >>> result.proportion_estimate
    0.2

    We can use the `proportion_ci()` method of the result to compute the
    confidence interval of the estimate:

    >>> result.proportion_ci(confidence_level=0.95)
    ConfidenceInterval(low=0.05684686759024681, high=1.0)

    """
    k = _validate_int(k, 'k', minimum=0)
    n = _validate_int(n, 'n', minimum=1)
    if k > n:
        raise ValueError('k must not be greater than n.')

    if not (0 <= p <= 1):
        raise ValueError("p must be in range [0,1]")

    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized; \n"
                         "must be 'two-sided', 'less' or 'greater'")

    if alternative == 'less':
        pval = binom.cdf(k, n, p)
    elif alternative == 'greater':
        pval = binom.sf(k-1, n, p)
    else:
        # alternative is 'two-sided'
        d = binom.pmf(k, n, p)
        # Relative tolerance applied to the observed PMF value when looking
        # for terms that are "at most as likely" on the other side.
        rerr = 1 + 1e-7
        threshold = d*rerr
        expected = p * n

        def pmf(x1):
            return binom.pmf(x1, n, p)

        def neg_pmf(x1):
            return -binom.pmf(x1, n, p)

        if k == expected:
            # special case as shortcut, would also be handled by `else` below
            pval = 1.
        elif k < expected:
            ix = _binary_search_for_binom_tst(neg_pmf, -threshold,
                                              np.ceil(expected), n)
            # y is the number of terms between mode and n that are <= d*rerr.
            # ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
            # if the first equality doesn't hold, y=n-ix. Otherwise, we
            # need to include ix as well as the equality holds. Note that
            # the equality will hold in very very rare situations due to rerr.
            y = n - ix + int(threshold == binom.pmf(ix, n, p))
            pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
        else:
            ix = _binary_search_for_binom_tst(pmf, threshold,
                                              0, np.floor(expected))
            # y is the number of terms between 0 and mode that are <= d*rerr.
            # we need to add a 1 to account for the 0 index.
            # For comparing this with old behavior, see
            # tst_binary_srch_for_binom_tst method in test_morestats.
            y = ix + 1
            pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)

        # The sum of the two tails is capped so the p-value never exceeds 1.
        pval = min(1.0, pval)

    return BinomTestResult(k=k, n=n, alternative=alternative,
                           proportion_estimate=k/n, pvalue=pval)
|
||||
|
||||
|
||||
def _binary_search_for_binom_tst(a, d, lo, hi):
    """
    Conducts an implicit binary search on a function specified by `a`.

    Meant to be used on the binomial PMF for the case of two-sided tests
    to obtain the value on the other side of the mode where the tail
    probability should be computed. The values on either side of
    the mode are always in order, meaning binary search is applicable.

    Parameters
    ----------
    a : callable
      The function over which to perform binary search. Its values
      for inputs lo and hi should be in ascending order.
    d : float
      The value to search.
    lo : int
      The lower end of range to search.
    hi : int
      The higher end of the range to search.

    Returns
    -------
    int
      The index, i between lo and hi
      such that a(i)<=d<a(i+1)
    """
    while lo < hi:
        mid = lo + (hi - lo)//2
        value = a(mid)
        if value < d:
            lo = mid + 1
            continue
        if value > d:
            hi = mid - 1
            continue
        # Neither smaller nor larger: treat `mid` as the match.
        return mid
    # The loop narrowed the range without a match; step back by one when
    # the value at `lo` is greater than `d`.
    return lo if a(lo) <= d else lo - 1
|
||||
@@ -0,0 +1,23 @@
|
||||
from scipy.stats._boost.beta_ufunc import (
|
||||
_beta_pdf, _beta_cdf, _beta_sf, _beta_ppf,
|
||||
_beta_isf, _beta_mean, _beta_variance,
|
||||
_beta_skewness, _beta_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.binom_ufunc import (
|
||||
_binom_pdf, _binom_cdf, _binom_sf, _binom_ppf,
|
||||
_binom_isf, _binom_mean, _binom_variance,
|
||||
_binom_skewness, _binom_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.nbinom_ufunc import (
|
||||
_nbinom_pdf, _nbinom_cdf, _nbinom_sf, _nbinom_ppf,
|
||||
_nbinom_isf, _nbinom_mean, _nbinom_variance,
|
||||
_nbinom_skewness, _nbinom_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.hypergeom_ufunc import (
|
||||
_hypergeom_pdf, _hypergeom_cdf, _hypergeom_sf, _hypergeom_ppf,
|
||||
_hypergeom_isf, _hypergeom_mean, _hypergeom_variance,
|
||||
_hypergeom_skewness, _hypergeom_kurtosis_excess,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,60 @@
|
||||
import pathlib
|
||||
import sys
|
||||
|
||||
|
||||
def pre_build_hook(build_ext, ext):
    """Append the C++ standard flag, if any, to the extension's compile args.

    Parameters
    ----------
    build_ext : object
        Build command object; its ``_cxx_compiler`` attribute is passed to
        ``get_cxx_std_flag``.
    ext : object
        Extension whose ``extra_compile_args`` list is extended in place.
    """
    from scipy._build_utils.compiler_helper import get_cxx_std_flag

    cxx_std_flag = get_cxx_std_flag(build_ext._cxx_compiler)
    if cxx_std_flag is None:
        # No flag reported for this compiler: leave the extension untouched.
        return
    ext.extra_compile_args.append(cxx_std_flag)
|
||||
|
||||
|
||||
def configuration(parent_package='', top_path=None):
    """Build the ``numpy.distutils`` configuration for the ``_boost`` package.

    One C++ extension named ``<scipy_name>_ufunc`` is added for every entry
    of ``_klass_mapper`` found in the ``_info`` module under ``include/``.

    Parameters
    ----------
    parent_package : str, optional
        Forwarded to ``Configuration``.
    top_path : str or None, optional
        Forwarded to ``Configuration``.

    Returns
    -------
    config : Configuration
        The populated configuration object.
    """
    from scipy._lib._boost_utils import _boost_dir
    from scipy._build_utils import import_file
    from numpy.distutils.misc_util import Configuration
    import numpy as np
    config = Configuration('_boost', parent_package, top_path)

    macros = [
        # return nan instead of throwing
        ('BOOST_MATH_DOMAIN_ERROR_POLICY', 'ignore_error'),
        ('BOOST_MATH_EVALUATION_ERROR_POLICY', 'user_error'),
        ('BOOST_MATH_OVERFLOW_ERROR_POLICY', 'user_error'),
    ]
    if sys.maxsize > 2**32:
        # 32-bit machines lose too much precision with no promotion,
        # so only set this policy for 64-bit machines
        macros.append(('BOOST_MATH_PROMOTE_DOUBLE_POLICY', 'false'))

    include_dirs = [
        'include/',
        'src/',
        np.get_include(),
        _boost_dir(),
    ]

    # generate the PXD and PYX wrappers
    package_dir = pathlib.Path(__file__).parent
    src_dir = package_dir / 'src'
    info_module = import_file(package_dir / 'include', '_info')
    for klass in info_module._klass_mapper.values():
        ufunc_name = f'{klass.scipy_name}_ufunc'
        ext = config.add_extension(
            ufunc_name,
            sources=[f'{src_dir}/{ufunc_name}.cxx'],
            include_dirs=include_dirs,
            define_macros=macros,
            language='c++',
            depends=[
                'include/func_defs.hpp',
                'include/Templated_PyUFunc.hpp',
            ],
        )
        # Add c++11/14 support:
        ext._pre_build_hook = pre_build_hook

    return config
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run the numpy.distutils setup for this package.
    from numpy.distutils.core import setup

    config_kwargs = configuration(top_path='').todict()
    setup(**config_kwargs)
|
||||
@@ -0,0 +1,488 @@
|
||||
import warnings
|
||||
import numpy as np
|
||||
from scipy._lib._util import check_random_state
|
||||
from scipy.special import ndtr, ndtri
|
||||
from scipy._lib._util import rng_integers
|
||||
from dataclasses import make_dataclass
|
||||
from ._common import ConfidenceInterval
|
||||
from ._axis_nan_policy import _broadcast_concatenate
|
||||
|
||||
|
||||
class BootstrapDegenerateDistributionWarning(RuntimeWarning):
    """
    Warning generated by `bootstrap` when BCa method is used and
    the bootstrap distribution is degenerate.

    Parameters
    ----------
    msg : str, optional
        Custom warning text.  When omitted, a default message stating that
        the confidence interval is not defined is used.
    """

    def __init__(self, msg=None):
        default_msg = ("The bootstrap distribution is degenerate; the "
                       "confidence interval is not defined.")
        # Store the message as the sole exception argument.
        self.args = (default_msg if msg is None else msg,)
|
||||
|
||||
|
||||
def _vectorize_statistic(statistic):
    """Vectorize an n-sample statistic.

    Parameters
    ----------
    statistic : callable
        Function accepting one 1-D array per sample as separate positional
        arguments.

    Returns
    -------
    stat_nd : callable
        Function with signature ``stat_nd(*data, axis=0)`` that applies
        `statistic` along `axis` of the samples in `data`.
    """
    # This is a little cleaner than np.nditer at the expense of some data
    # copying: concatenate samples together, then use np.apply_along_axis
    def stat_nd(*data, axis=0):
        sample_lengths = [sample.shape[axis] for sample in data]
        # Positions at which a concatenated slice splits back into samples.
        boundaries = np.cumsum(sample_lengths)[:-1]
        stacked = _broadcast_concatenate(data, axis)

        def stat_1d(z):
            pieces = np.split(z, boundaries)
            return statistic(*pieces)

        result = np.apply_along_axis(stat_1d, axis, stacked)
        return result[()]
    return stat_nd
|
||||
|
||||
|
||||
def _jackknife_resample(sample, batch=None):
    """Jackknife resample the sample. Only one-sample stats for now.

    Parameters
    ----------
    sample : ndarray
        Observations lie along the last axis.
    batch : int, optional
        Number of leave-one-out resamples generated per yielded array.
        When falsy, all resamples are produced in a single batch.

    Yields
    ------
    resamples : ndarray
        Array whose last two axes have shape ``(batch_size, n - 1)``; each
        row along the second-to-last axis omits one observation.
    """
    n = sample.shape[-1]
    step = batch or n
    positions = np.arange(n)

    for start in range(0, n, step):
        stop = min(start + step, n)
        n_rows = stop - start

        # jackknife - each row leaves out one observation: row r of this
        # batch drops the observation at position start + r.
        keep = positions != positions[start:stop, np.newaxis]
        idx = np.broadcast_to(positions, (n_rows, n))[keep]
        idx = idx.reshape((n_rows, n - 1))

        yield sample[..., idx]
|
||||
|
||||
|
||||
def _bootstrap_resample(sample, n_resamples=None, random_state=None):
    """Bootstrap resample the sample.

    Parameters
    ----------
    sample : ndarray
        Observations lie along the last axis.
    n_resamples : int, optional
        Number of resamples to draw.
    random_state : object, optional
        Random number generator handed to ``rng_integers``.

    Returns
    -------
    resamples : ndarray
        Array whose last two axes have shape ``(n_resamples, n)``, where
        ``n`` is the length of the last axis of `sample`.
    """
    n_obs = sample.shape[-1]

    # bootstrap - each row is a random resample of original observations
    draw_shape = (n_resamples, n_obs)
    picks = rng_integers(random_state, 0, n_obs, draw_shape)

    return sample[..., picks]
|
||||
|
||||
|
||||
def _percentile_of_score(a, score, axis):
    """Vectorized, simplified `scipy.stats.percentileofscore`.

    Unlike `stats.percentileofscore`, the percentile returned is a fraction
    in [0, 1].

    Parameters
    ----------
    a : ndarray
        Values against which `score` is compared.
    score : array_like
        Score(s), broadcast against `a`.
    axis : int
        Axis of `a` along which values are counted.

    Returns
    -------
    fraction : float or ndarray
        Fraction of the values along `axis` that are strictly less than
        `score`.
    """
    n_values = a.shape[axis]
    strictly_below = a < score
    return strictly_below.sum(axis=axis) / n_values
|
||||
|
||||
|
||||
def _percentile_along_axis(theta_hat_b, alpha):
    """`np.percentile` with different percentile for each slice.

    Parameters
    ----------
    theta_hat_b : ndarray
        Values whose percentiles are taken along the last axis.
    alpha : array_like
        Percentile for each slice; broadcast to
        ``theta_hat_b.shape[:-1]``.

    Returns
    -------
    percentiles : float or ndarray
        Percentile of each slice.  Entries whose `alpha` is NaN are NaN,
        and a `BootstrapDegenerateDistributionWarning` is emitted for each.
    """
    # the difference between _percentile_along_axis and np.percentile is that
    # np.percentile gets _all_ the qs for each axis slice, whereas
    # _percentile_along_axis gets the q corresponding with each axis slice
    alpha = np.broadcast_to(alpha, theta_hat_b.shape[:-1])
    percentiles = np.zeros_like(alpha, dtype=np.float64)

    for idx in np.ndindex(alpha.shape):
        q = alpha[idx]
        if np.isnan(q):
            # e.g. when bootstrap distribution has only one unique element
            warnings.warn(BootstrapDegenerateDistributionWarning())
            percentiles[idx] = np.nan
            continue
        percentiles[idx] = np.percentile(theta_hat_b[idx], q)

    return percentiles[()]  # return scalar instead of 0d array
|
||||
|
||||
|
||||
def _bca_interval(data, statistic, axis, alpha, theta_hat_b, batch):
    """Bias-corrected and accelerated interval.

    Parameters
    ----------
    data : sequence of ndarray
        Only the first sample is used.
    statistic : callable
        Statistic accepting a sample and an ``axis`` keyword.
    axis : int
        Axis passed to `statistic` when evaluating it on the sample.
    alpha : float
        Nominal lower-tail probability of the interval.
    theta_hat_b : ndarray
        Bootstrap distribution of the statistic along the last axis.
    batch : int or None
        Batch size forwarded to `_jackknife_resample`.

    Returns
    -------
    alpha_1, alpha_2 : float or ndarray
        Adjusted lower and upper percentile levels, as fractions.
    """
    # closely follows [2] "BCa Bootstrap CIs"
    sample = data[0]  # only works with 1 sample statistics right now

    # calculate z0_hat
    theta_hat = np.asarray(statistic(sample, axis=axis))[..., None]
    percentile = _percentile_of_score(theta_hat_b, theta_hat, axis=-1)
    z0_hat = ndtri(percentile)

    # calculate a_hat
    jackknife_stats = [statistic(jackknife_sample, axis=-1)
                       for jackknife_sample
                       in _jackknife_resample(sample, batch)]
    theta_hat_i = np.concatenate(jackknife_stats, axis=-1)
    theta_hat_dot = theta_hat_i.mean(axis=-1, keepdims=True)
    num = ((theta_hat_dot - theta_hat_i)**3).sum(axis=-1)
    den = 6*((theta_hat_dot - theta_hat_i)**2).sum(axis=-1)**(3/2)
    a_hat = num / den

    # calculate alpha_1, alpha_2
    def adjusted_level(z):
        shifted = z0_hat + z
        return ndtr(z0_hat + shifted/(1 - a_hat*shifted))

    z_alpha = ndtri(alpha)
    z_1alpha = -z_alpha
    alpha_1 = adjusted_level(z_alpha)
    alpha_2 = adjusted_level(z_1alpha)
    return alpha_1, alpha_2
|
||||
|
||||
|
||||
def _bootstrap_iv(data, statistic, vectorized, paired, axis, confidence_level,
                  n_resamples, batch, method, random_state):
    """Input validation and standardization for `bootstrap`.

    Returns
    -------
    tuple
        ``(data, statistic, vectorized, paired, axis, confidence_level,
        n_resamples, batch, method, random_state)`` after standardization:
        every sample has `axis` moved to the last position, `statistic` is
        wrapped when `vectorized` is false or `paired` is true, numeric
        arguments are converted to ``int``/``float``, `method` is
        lower-cased and `random_state` is passed through
        ``check_random_state``.

    Raises
    ------
    ValueError
        If any argument fails validation.
    """
    if vectorized not in {True, False}:
        raise ValueError("`vectorized` must be `True` or `False`.")

    if not vectorized:
        statistic = _vectorize_statistic(statistic)

    axis_int = int(axis)
    if axis != axis_int:
        raise ValueError("`axis` must be an integer.")

    try:
        n_samples = len(data)
    except TypeError:
        raise ValueError("`data` must be a sequence of samples.")

    if n_samples == 0:
        raise ValueError("`data` must contain at least one sample.")

    samples = []
    for sample in data:
        sample = np.atleast_1d(sample)
        if sample.shape[axis_int] <= 1:
            raise ValueError("each sample in `data` must contain two or more "
                             "observations along `axis`.")
        # Standardize so that observations always lie along the last axis.
        samples.append(np.moveaxis(sample, axis_int, -1))

    if paired not in {True, False}:
        raise ValueError("`paired` must be `True` or `False`.")

    if paired:
        n = samples[0].shape[-1]
        if any(sample.shape[-1] != n for sample in samples[1:]):
            message = ("When `paired is True`, all samples must have the "
                       "same length along `axis`")
            raise ValueError(message)

        # to generate the bootstrap distribution for paired-sample statistics,
        # resample the indices of the observations
        def statistic(i, axis=-1, data=samples, unpaired_statistic=statistic):
            data = [sample[..., i] for sample in data]
            return unpaired_statistic(*data, axis=axis)

        # The "data" to resample is now the index vector itself.
        samples = [np.arange(n)]

    confidence_level_float = float(confidence_level)

    n_resamples_int = int(n_resamples)
    if n_resamples != n_resamples_int or n_resamples_int <= 0:
        raise ValueError("`n_resamples` must be a positive integer.")

    batch_iv = batch
    if batch is not None:
        batch_iv = int(batch)
        if batch != batch_iv or batch_iv <= 0:
            raise ValueError("`batch` must be a positive integer or None.")

    methods = {'percentile', 'basic', 'bca'}
    method = method.lower()
    if method not in methods:
        raise ValueError(f"`method` must be in {methods}")

    message = "`method = 'BCa' is only available for one-sample statistics"
    if not paired and n_samples > 1 and method == 'bca':
        raise ValueError(message)

    random_state = check_random_state(random_state)

    return (samples, statistic, vectorized, paired, axis_int,
            confidence_level_float, n_resamples_int, batch_iv,
            method, random_state)
|
||||
|
||||
|
||||
# Result container returned by `bootstrap`; one attribute per field name.
fields = ['confidence_interval', 'standard_error']
BootstrapResult = make_dataclass(cls_name="BootstrapResult", fields=fields)
|
||||
|
||||
|
||||
def bootstrap(data, statistic, *, vectorized=True, paired=False, axis=0,
              confidence_level=0.95, n_resamples=9999, batch=None,
              method='BCa', random_state=None):
    r"""
    Compute a two-sided bootstrap confidence interval of a statistic.

    When `method` is ``'percentile'``, a bootstrap confidence interval is
    computed according to the following procedure.

    1. Resample the data: for each sample in `data` and for each of
       `n_resamples`, take a random sample of the original sample
       (with replacement) of the same size as the original sample.

    2. Compute the bootstrap distribution of the statistic: for each set of
       resamples, compute the test statistic.

    3. Determine the confidence interval: find the interval of the bootstrap
       distribution that is

       - symmetric about the median and
       - contains `confidence_level` of the resampled statistic values.

    While the ``'percentile'`` method is the most intuitive, it is rarely
    used in practice. Two more common methods are available, ``'basic'``
    ('reverse percentile') and ``'BCa'`` ('bias-corrected and accelerated');
    they differ in how step 3 is performed.

    If the samples in `data` are  taken at random from their respective
    distributions :math:`n` times, the confidence interval returned by
    `bootstrap` will contain the true value of the statistic for those
    distributions approximately `confidence_level`:math:`\, \times \, n` times.

    Parameters
    ----------
    data : sequence of array-like
         Each element of data is a sample from an underlying distribution.
    statistic : callable
        Statistic for which the confidence interval is to be calculated.
        `statistic` must be a callable that accepts ``len(data)`` samples
        as separate arguments and returns the resulting statistic.
        If `vectorized` is set ``True``,
        `statistic` must also accept a keyword argument `axis` and be
        vectorized to compute the statistic along the provided `axis`.
    vectorized : bool, default: ``True``
        If `vectorized` is set ``False``, `statistic` will not be passed
        keyword argument `axis`, and is assumed to calculate the statistic
        only for 1D samples.
    paired : bool, default: ``False``
        Whether the statistic treats corresponding elements of the samples
        in `data` as paired.
    axis : int, default: ``0``
        The axis of the samples in `data` along which the `statistic` is
        calculated.
    confidence_level : float, default: ``0.95``
        The confidence level of the confidence interval.
    n_resamples : int, default: ``9999``
        The number of resamples performed to form the bootstrap distribution
        of the statistic.
    batch : int, optional
        The number of resamples to process in each vectorized call to
        `statistic`. Memory usage is O(`batch`*``n``), where ``n`` is the
        sample size. Default is ``None``, in which case ``batch = n_resamples``
        (or ``batch = max(n_resamples, n)`` for ``method='BCa'``).
    method : {'percentile', 'basic', 'bca'}, default: ``'BCa'``
        Whether to return the 'percentile' bootstrap confidence interval
        (``'percentile'``), the 'reverse' (``'basic'``) or the bias-corrected
        and accelerated bootstrap confidence interval (``'BCa'``).
        Note that only ``'percentile'`` and ``'basic'`` support multi-sample
        statistics at this time.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        Pseudorandom number generator state used to generate resamples.

        If `random_state` is ``None`` (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.

    Returns
    -------
    res : BootstrapResult
        An object with attributes:

        confidence_interval : ConfidenceInterval
            The bootstrap confidence interval as an instance of
            `collections.namedtuple` with attributes `low` and `high`.
        standard_error : float or ndarray
            The bootstrap standard error, that is, the sample standard
            deviation of the bootstrap distribution

    Notes
    -----
    Elements of the confidence interval may be NaN for ``method='BCa'`` if
    the bootstrap distribution is degenerate (e.g. all elements are identical).
    In this case, consider using another `method` or inspecting `data` for
    indications that other analysis may be more appropriate (e.g. all
    observations are identical).

    References
    ----------
    .. [1] B. Efron and R. J. Tibshirani, An Introduction to the Bootstrap,
       Chapman & Hall/CRC, Boca Raton, FL, USA (1993)
    .. [2] Nathaniel E. Helwig, "Bootstrap Confidence Intervals",
       http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
    .. [3] Bootstrapping (statistics), Wikipedia,
       https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29

    Examples
    --------
    Suppose we have sampled data from an unknown distribution.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> from scipy.stats import norm
    >>> dist = norm(loc=2, scale=4)  # our "unknown" distribution
    >>> data = dist.rvs(size=100, random_state=rng)

    We are interested in the standard deviation of the distribution.

    >>> std_true = dist.std()      # the true value of the statistic
    >>> print(std_true)
    4.0
    >>> std_sample = np.std(data)  # the sample statistic
    >>> print(std_sample)
    3.9460644295563863

    We can calculate a 90% confidence interval of the statistic using
    `bootstrap`.

    >>> from scipy.stats import bootstrap
    >>> data = (data,)  # samples must be in a sequence
    >>> res = bootstrap(data, np.std, confidence_level=0.9,
    ...                 random_state=rng)
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=3.57655333533867, high=4.382043696342881)

    If we sample from the distribution 1000 times and form a bootstrap
    confidence interval for each sample, the confidence interval
    contains the true value of the statistic approximately 900 times.

    >>> n_trials = 1000
    >>> ci_contains_true_std = 0
    >>> for i in range(n_trials):
    ...    data = (dist.rvs(size=100, random_state=rng),)
    ...    ci = bootstrap(data, np.std, confidence_level=0.9, n_resamples=1000,
    ...                   random_state=rng).confidence_interval
    ...    if ci[0] < std_true < ci[1]:
    ...        ci_contains_true_std += 1
    >>> print(ci_contains_true_std)
    875

    Rather than writing a loop, we can also determine the confidence intervals
    for all 1000 samples at once.

    >>> data = (dist.rvs(size=(n_trials, 100), random_state=rng),)
    >>> res = bootstrap(data, np.std, axis=-1, confidence_level=0.9,
    ...                 n_resamples=1000, random_state=rng)
    >>> ci_l, ci_u = res.confidence_interval

    Here, `ci_l` and `ci_u` contain the confidence interval for each of the
    ``n_trials = 1000`` samples.

    >>> print(ci_l[995:])
    [3.77729695 3.75090233 3.45829131 3.34078217 3.48072829]
    >>> print(ci_u[995:])
    [4.88316666 4.86924034 4.32032996 4.2822427  4.59360598]

    And again, approximately 90% contain the true value, ``std_true = 4``.

    >>> print(np.sum((ci_l < std_true) & (std_true < ci_u)))
    900

    `bootstrap` can also be used to estimate confidence intervals of
    multi-sample statistics, including those calculated by hypothesis
    tests. `scipy.stats.mood` performs Mood's test for equal scale parameters,
    and it returns two outputs: a statistic, and a p-value. To get a
    confidence interval for the test statistic, we first wrap
    `scipy.stats.mood` in a function that accepts two sample arguments,
    accepts an `axis` keyword argument, and returns only the statistic.

    >>> from scipy.stats import mood
    >>> def my_statistic(sample1, sample2, axis):
    ...     statistic, _ = mood(sample1, sample2, axis=-1)
    ...     return statistic

    Here, we use the 'basic' method with the default 95% confidence level.

    >>> sample1 = norm.rvs(scale=1, size=100, random_state=rng)
    >>> sample2 = norm.rvs(scale=2, size=100, random_state=rng)
    >>> data = (sample1, sample2)
    >>> res = bootstrap(data, my_statistic, method='basic', random_state=rng)
    >>> print(mood(sample1, sample2)[0])  # element 0 is the statistic
    -5.521109549096542
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=-7.255994487314675, high=-4.016202624747605)

    The bootstrap estimate of the standard error is also available.

    >>> print(res.standard_error)
    0.8344963846318795

    Paired-sample statistics work, too. For example, consider the Pearson
    correlation coefficient.

    >>> from scipy.stats import pearsonr
    >>> n = 100
    >>> x = np.linspace(0, 10, n)
    >>> y = x + rng.uniform(size=n)
    >>> print(pearsonr(x, y)[0])  # element 0 is the statistic
    0.9962357936065914

    We wrap `pearsonr` so that it returns only the statistic.

    >>> def my_statistic(x, y):
    ...     return pearsonr(x, y)[0]

    We call `bootstrap` using ``paired=True``.
    Also, since ``my_statistic`` isn't vectorized to calculate the statistic
    along a given axis, we pass in ``vectorized=False``.

    >>> res = bootstrap((x, y), my_statistic, vectorized=False, paired=True,
    ...                 random_state=rng)
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=0.9950085825848624, high=0.9971212407917498)

    """
    # Input validation
    (data, statistic, vectorized, paired, axis, confidence_level,
     n_resamples, batch, method, random_state) = _bootstrap_iv(
        data, statistic, vectorized, paired, axis, confidence_level,
        n_resamples, batch, method, random_state)

    # Build the bootstrap distribution in chunks of at most `chunk`
    # resamples per call to `statistic`.
    chunk = batch or n_resamples
    batches = []
    for start in range(0, n_resamples, chunk):
        size = min(chunk, n_resamples - start)
        # Generate resamples
        resampled_data = [
            _bootstrap_resample(sample, n_resamples=size,
                                random_state=random_state)
            for sample in data
        ]
        # Compute bootstrap distribution of statistic
        batches.append(statistic(*resampled_data, axis=-1))
    theta_hat_b = np.concatenate(batches, axis=-1)

    # Calculate percentile interval
    alpha = (1 - confidence_level)/2
    if method == 'bca':
        interval = _bca_interval(data, statistic, axis=-1, alpha=alpha,
                                 theta_hat_b=theta_hat_b, batch=batch)
        percentile_fun = _percentile_along_axis
    else:
        interval = alpha, 1-alpha

        def percentile_fun(a, q):
            return np.percentile(a=a, q=q, axis=-1)

    # Calculate confidence interval of statistic
    lower_level, upper_level = interval[0], interval[1]
    ci_l = percentile_fun(theta_hat_b, lower_level*100)
    ci_u = percentile_fun(theta_hat_b, upper_level*100)
    if method == 'basic':  # see [3]
        # Reflect the percentile interval about the observed statistic.
        theta_hat = statistic(*data, axis=-1)
        ci_l, ci_u = 2*theta_hat - ci_u, 2*theta_hat - ci_l

    return BootstrapResult(confidence_interval=ConfidenceInterval(ci_l, ci_u),
                           standard_error=np.std(theta_hat_b, ddof=1, axis=-1))
|
||||
@@ -0,0 +1,6 @@
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
# Lightweight ``(low, high)`` pair used to report the two end points of a
# confidence interval.
ConfidenceInterval = namedtuple("ConfidenceInterval", "low high")
ConfidenceInterval.__doc__ = "Class for confidence intervals."
|
||||
@@ -0,0 +1,31 @@
|
||||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
# Machine epsilon for double precision: the gap between 1.0 and the next
# larger representable float, so that 1.0 + _EPS != 1.0.
_EPS = np.finfo(float).eps

# The largest [in magnitude] finite double-precision floating value.
_XMAX = np.finfo(float).max

# The log of the largest usable floating value; useful for knowing
# when exp(something) will overflow
_LOGXMAX = np.log(_XMAX)

# The smallest [in magnitude] positive *normal* floating value
# (``np.finfo(float).tiny``); subnormal numbers can be smaller still.
_XMIN = np.finfo(float).tiny

# Euler-Mascheroni constant, equal to -special.psi(1)
_EULER = 0.577215664901532860606512090082402431042

# Apery's constant, equal to special.zeta(3, 1)
_ZETA3 = 1.202056903159594285399738161511449990765

# sqrt(2/pi)
_SQRT_2_OVER_PI = 0.7978845608028654

# log(sqrt(2/pi))
_LOG_SQRT_2_OVER_PI = -0.22579135264472744
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,194 @@
|
||||
import numpy as np
|
||||
from scipy.sparse import coo_matrix
|
||||
|
||||
|
||||
def crosstab(*args, levels=None, sparse=False):
    """
    Return table of counts for each possible unique combination in ``*args``.

    When ``len(args) > 1``, the array computed by this function is
    often referred to as a *contingency table* [1]_.

    The arguments must be sequences with the same length.  The second return
    value, `count`, is an integer array with ``len(args)`` dimensions.  If
    `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
    is the number of unique elements in ``args[k]``.

    Parameters
    ----------
    *args : sequences
        A sequence of sequences whose unique aligned elements are to be
        counted.  The sequences in args must all be the same length.
    levels : sequence, optional
        If `levels` is given, it must be a sequence that is the same length as
        `args`.  Each element in `levels` is either a sequence or None.  If it
        is a sequence, it gives the values in the corresponding sequence in
        `args` that are to be counted.  If any value in the sequences in `args`
        does not occur in the corresponding sequence in `levels`, that value
        is ignored and not counted in the returned array `count`.  The default
        value of `levels` for ``args[i]`` is ``np.unique(args[i])``
    sparse : bool, optional
        If True, return a sparse matrix.  The matrix will be an instance of
        the `scipy.sparse.coo_matrix` class.  Because SciPy's sparse matrices
        must be 2-d, only two input sequences are allowed when `sparse` is
        True.  Default is False.

    Returns
    -------
    elements : tuple of numpy.ndarrays.
        Tuple of length ``len(args)`` containing the arrays of elements that
        are counted in `count`.  These can be interpreted as the labels of
        the corresponding dimensions of `count`.
        If `levels` was given, then if ``levels[i]`` is not None,
        ``elements[i]`` will hold the values given in ``levels[i]``.
    count : numpy.ndarray or scipy.sparse.coo_matrix
        Counts of the unique elements in ``zip(*args)``, stored in an array.
        Also known as a *contingency table* when ``len(args) > 1``.
        The shape is always ``tuple(len(e) for e in elements)``, for both
        the dense and the sparse result.

    Raises
    ------
    TypeError
        If no input sequence is given.
    ValueError
        If the input sequences differ in length, if `sparse` is True and the
        number of input sequences is not two, or if `levels` is given and
        its length differs from the number of input sequences.

    See Also
    --------
    numpy.unique

    Notes
    -----
    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] "Contingency table", https://en.wikipedia.org/wiki/Contingency_table

    Examples
    --------
    >>> from scipy.stats.contingency import crosstab

    Given the lists `a` and `x`, create a contingency table that counts the
    frequencies of the corresponding pairs.

    >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
    >>> (avals, xvals), count = crosstab(a, x)
    >>> avals
    array(['A', 'B'], dtype='<U1')
    >>> xvals
    array(['X', 'Y', 'Z'], dtype='<U1')
    >>> count
    array([[2, 3, 0],
           [1, 0, 4]])

    So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.

    Higher dimensional contingency tables can be created.

    >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
    >>> (avals, xvals, pvals), count = crosstab(a, x, p)
    >>> count
    array([[[2, 0],
            [2, 1],
            [0, 0]],
           [[1, 0],
            [0, 0],
            [1, 3]]])
    >>> count.shape
    (2, 3, 2)

    The values to be counted can be set by using the `levels` argument.
    It allows the elements of interest in each input sequence to be
    given explicitly instead of finding the unique elements of the sequence.

    For example, suppose one of the arguments is an array containing the
    answers to a survey question, with integer values 1 to 4.  Even if the
    value 1 does not occur in the data, we want an entry for it in the table.

    >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4]  # 1 does not occur.
    >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4]  # 3 does not occur.
    >>> options = [1, 2, 3, 4]
    >>> vals, count = crosstab(q1, q2, levels=(options, options))
    >>> count
    array([[0, 0, 0, 0],
           [1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If `levels` is given, but an element of `levels` is None, the unique values
    of the corresponding argument are used. For example,

    >>> vals, count = crosstab(q1, q2, levels=(None, options))
    >>> vals
    [array([2, 3, 4]), [1, 2, 3, 4]]
    >>> count
    array([[1, 1, 0, 1],
           [1, 4, 0, 1],
           [0, 3, 0, 3]])

    If we want to ignore the pairs where 4 occurs in ``q2``, we can
    give just the values [1, 2] to `levels`, and the 4 will be ignored:

    >>> vals, count = crosstab(q1, q2, levels=(None, [1, 2]))
    >>> vals
    [array([2, 3, 4]), [1, 2]]
    >>> count
    array([[1, 1],
           [1, 4],
           [0, 3]])

    Finally, let's repeat the first example, but return a sparse matrix:

    >>> (avals, xvals), count = crosstab(a, x, sparse=True)
    >>> count
    <2x3 sparse matrix of type '<class 'numpy.int64'>'
            with 4 stored elements in COOrdinate format>
    >>> count.toarray()
    array([[2, 3, 0],
           [1, 0, 4]])

    """
    nargs = len(args)
    if nargs == 0:
        raise TypeError("At least one input sequence is required.")

    len0 = len(args[0])
    if not all(len(a) == len0 for a in args[1:]):
        raise ValueError("All input sequences must have the same length.")

    if sparse and nargs != 2:
        raise ValueError("When `sparse` is True, only two input sequences "
                         "are allowed.")

    if levels is None:
        # Call np.unique with return_inverse=True on each argument.
        actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
                                       for a in args])
    else:
        # `levels` is not None...
        if len(levels) != nargs:
            raise ValueError('len(levels) must equal the number of input '
                             'sequences')

        args = [np.asarray(arg) for arg in args]
        # mask[k, i] is True when args[k][i] is one of the requested levels;
        # inv[k, i] is then the position of that value within the levels.
        mask = np.zeros((nargs, len0), dtype=np.bool_)
        inv = np.zeros((nargs, len0), dtype=np.intp)
        actual_levels = []
        for k, (levels_list, arg) in enumerate(zip(levels, args)):
            if levels_list is None:
                levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
                mask[k, :] = True
            else:
                # Compare every observation against every requested level.
                q = arg == np.asarray(levels_list).reshape(-1, 1)
                mask[k, :] = np.any(q, axis=0)
                qnz = q.T.nonzero()
                inv[k, qnz[0]] = qnz[1]
            actual_levels.append(levels_list)

        # Keep only the observations whose value is counted in *every*
        # argument.
        mask_all = mask.all(axis=0)
        indices = tuple(inv[:, mask_all])

    # The table dimensions are defined by the levels, not by the indices that
    # happen to occur in the data.
    shape = tuple(len(u) for u in actual_levels)

    if sparse:
        # Pass `shape` explicitly: otherwise coo_matrix infers it from the
        # largest observed index, which drops trailing levels that never
        # occur and fails outright when there are no observations.
        count = coo_matrix((np.ones(len(indices[0]), dtype=int),
                            (indices[0], indices[1])),
                           shape=shape)
        count.sum_duplicates()
    else:
        count = np.zeros(shape, dtype=int)
        np.add.at(count, indices, 1)

    return actual_levels, count
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Sane parameters for stats.distributions.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
distcont = [
|
||||
['alpha', (3.5704770516650459,)],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (1.0,)],
|
||||
['beta', (2.3098496451481823, 0.62687954300963677)],
|
||||
['betaprime', (5, 6)],
|
||||
['bradford', (0.29891359763170633,)],
|
||||
['burr', (10.5, 4.3)],
|
||||
['burr12', (10, 4)],
|
||||
['cauchy', ()],
|
||||
['chi', (78,)],
|
||||
['chi2', (55,)],
|
||||
['cosine', ()],
|
||||
['crystalball', (2.0, 3.0)],
|
||||
['dgamma', (1.1023326088288166,)],
|
||||
['dweibull', (2.0685080649914673,)],
|
||||
['erlang', (10,)],
|
||||
['expon', ()],
|
||||
['exponnorm', (1.5,)],
|
||||
['exponpow', (2.697119160358469,)],
|
||||
['exponweib', (2.8923945291034436, 1.9505288745913174)],
|
||||
['f', (29, 18)],
|
||||
['fatiguelife', (29,)], # correction numargs = 1
|
||||
['fisk', (3.0857548622253179,)],
|
||||
['foldcauchy', (4.7164673455831894,)],
|
||||
['foldnorm', (1.9521253373555869,)],
|
||||
['gamma', (1.9932305483800778,)],
|
||||
['gausshyper', (13.763771604130699, 3.1189636648681431,
|
||||
2.5145980350183019, 5.1811649903971615)], # veryslow
|
||||
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
|
||||
['genextreme', (-0.1,)],
|
||||
['gengamma', (4.4162385429431925, 3.1193091679242761)],
|
||||
['gengamma', (4.4162385429431925, -3.1193091679242761)],
|
||||
['genhalflogistic', (0.77274727809929322,)],
|
||||
['genhyperbolic', (0.5, 1.5, -0.5,)],
|
||||
['geninvgauss', (2.3, 1.5)],
|
||||
['genlogistic', (0.41192440799679475,)],
|
||||
['gennorm', (1.2988442399460265,)],
|
||||
['halfgennorm', (0.6748054997000371,)],
|
||||
['genpareto', (0.1,)], # use case with finite moments
|
||||
['gilbrat', ()],
|
||||
['gompertz', (0.94743713075105251,)],
|
||||
['gumbel_l', ()],
|
||||
['gumbel_r', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (4.0668996136993067,)],
|
||||
['invgauss', (0.14546264555347513,)],
|
||||
['invweibull', (10.58,)],
|
||||
['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
|
||||
['johnsonsu', (2.554395574161155, 2.2482281679651965)],
|
||||
['kappa4', (0.0, 0.0)],
|
||||
['kappa4', (-0.1, 0.1)],
|
||||
['kappa4', (0.0, 0.1)],
|
||||
['kappa4', (0.1, 0.0)],
|
||||
['kappa3', (1.0,)],
|
||||
['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
|
||||
['kstwo', (10,)],
|
||||
['kstwobign', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (2,)],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (1.8, -0.5)],
|
||||
['loggamma', (0.41411931826052117,)],
|
||||
['logistic', ()],
|
||||
['loglaplace', (3.2505926592051435,)],
|
||||
['lognorm', (0.95368226960575331,)],
|
||||
['loguniform', (0.01, 1.25)],
|
||||
['lomax', (1.8771398388773268,)],
|
||||
['maxwell', ()],
|
||||
['mielke', (10.4, 4.6)],
|
||||
['moyal', ()],
|
||||
['nakagami', (4.9673794866666237,)],
|
||||
['ncf', (27, 27, 0.41578441799226107)],
|
||||
['nct', (14, 0.24045031331198066)],
|
||||
['ncx2', (21, 1.0560465975116415)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (1.25, 0.5)],
|
||||
['pareto', (2.621716532144454,)],
|
||||
['pearson3', (0.1,)],
|
||||
['powerlaw', (1.6591133289905851,)],
|
||||
['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
|
||||
['powernorm', (4.4453652254590779,)],
|
||||
['rayleigh', ()],
|
||||
['rdist', (1.6,)],
|
||||
['recipinvgauss', (0.63004267809369119,)],
|
||||
['reciprocal', (0.01, 1.25)],
|
||||
['rice', (0.7749725210111873,)],
|
||||
['semicircular', ()],
|
||||
['skewcauchy', (0.5,)],
|
||||
['skewnorm', (4.0,)],
|
||||
['studentized_range', (3.0, 10.0)],
|
||||
['t', (2.7433514990818093,)],
|
||||
['trapezoid', (0.2, 0.8)],
|
||||
['triang', (0.15785029824528218,)],
|
||||
['truncexpon', (4.6907725456810478,)],
|
||||
['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
|
||||
['truncnorm', (0.1, 2.)],
|
||||
['tukeylambda', (3.1321477856738267,)],
|
||||
['uniform', ()],
|
||||
['vonmises', (3.9939042581071398,)],
|
||||
['vonmises_line', (3.9939042581071398,)],
|
||||
['wald', ()],
|
||||
['weibull_max', (2.8687961709100187,)],
|
||||
['weibull_min', (1.7866166930421596,)],
|
||||
['wrapcauchy', (0.031071279018614728,)]]
|
||||
|
||||
|
||||
distdiscrete = [
|
||||
['bernoulli',(0.3,)],
|
||||
['betabinom', (5, 2.3, 0.63)],
|
||||
['binom', (5, 0.4)],
|
||||
['boltzmann',(1.4, 19)],
|
||||
['dlaplace', (0.8,)], # 0.5
|
||||
['geom', (0.5,)],
|
||||
['hypergeom',(30, 12, 6)],
|
||||
['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
|
||||
['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
|
||||
['nchypergeom_fisher', (140, 80, 60, 0.5)],
|
||||
['nchypergeom_wallenius', (140, 80, 60, 0.5)],
|
||||
['logser', (0.6,)], # re-enabled, numpy ticket:921
|
||||
['nbinom', (0.4, 0.4)], # from tickets: 583
|
||||
['nbinom', (5, 0.5)],
|
||||
['planck', (0.51,)], # 4.1
|
||||
['poisson', (0.6,)],
|
||||
['randint', (7, 31)],
|
||||
['skellam', (15, 8)],
|
||||
['zipf', (6.5,)],
|
||||
['zipfian', (0.75, 15)],
|
||||
['zipfian', (1.25, 10)],
|
||||
['yulesimon', (11.0,)],
|
||||
['nhypergeom', (20, 7, 1)]
|
||||
]
|
||||
|
||||
|
||||
invdistdiscrete = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['hypergeom', (3, 3, 4)],
|
||||
['nhypergeom', (5, 2, 8)],
|
||||
['nchypergeom_fisher', (3, 3, 4, 1)],
|
||||
['nchypergeom_wallenius', (3, 3, 4, 1)],
|
||||
['bernoulli', (1.5, )],
|
||||
['binom', (10, 1.5)],
|
||||
['betabinom', (10, -0.4, -0.5)],
|
||||
['boltzmann', (-1, 4)],
|
||||
['dlaplace', (-0.5, )],
|
||||
['geom', (1.5, )],
|
||||
['logser', (1.5, )],
|
||||
['nbinom', (10, 1.5)],
|
||||
['planck', (-0.5, )],
|
||||
['poisson', (-0.5, )],
|
||||
['randint', (5, 2)],
|
||||
['skellam', (-5, -2)],
|
||||
['zipf', (-2, )],
|
||||
['yulesimon', (-2, )],
|
||||
['zipfian', (-0.75, 15)]
|
||||
]
|
||||
|
||||
|
||||
invdistcont = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['alpha', (-1, )],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (-1, )],
|
||||
['beta', (-2, 2)],
|
||||
['betaprime', (-2, 2)],
|
||||
['bradford', (-1, )],
|
||||
['burr', (-1, 1)],
|
||||
['burr12', (-1, 1)],
|
||||
['cauchy', ()],
|
||||
['chi', (-1, )],
|
||||
['chi2', (-1, )],
|
||||
['cosine', ()],
|
||||
['crystalball', (-1, 2)],
|
||||
['dgamma', (-1, )],
|
||||
['dweibull', (-1, )],
|
||||
['erlang', (-1, )],
|
||||
['expon', ()],
|
||||
['exponnorm', (-1, )],
|
||||
['exponweib', (1, -1)],
|
||||
['exponpow', (-1, )],
|
||||
['f', (10, -10)],
|
||||
['fatiguelife', (-1, )],
|
||||
['fisk', (-1, )],
|
||||
['foldcauchy', (-1, )],
|
||||
['foldnorm', (-1, )],
|
||||
['genlogistic', (-1, )],
|
||||
['gennorm', (-1, )],
|
||||
['genpareto', (np.inf, )],
|
||||
['genexpon', (1, 2, -3)],
|
||||
['genextreme', (np.inf, )],
|
||||
['genhyperbolic', (0.5, -0.5, -1.5,)],
|
||||
['gausshyper', (1, 2, 3, -4)],
|
||||
['gamma', (-1, )],
|
||||
['gengamma', (-1, 0)],
|
||||
['genhalflogistic', (-1, )],
|
||||
['geninvgauss', (1, 0)],
|
||||
['gilbrat', ()],
|
||||
['gompertz', (-1, )],
|
||||
['gumbel_r', ()],
|
||||
['gumbel_l', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['halfgennorm', (-1, )],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (-1, )],
|
||||
['invgauss', (-1, )],
|
||||
['invweibull', (-1, )],
|
||||
['johnsonsb', (1, -2)],
|
||||
['johnsonsu', (1, -2)],
|
||||
['kappa4', (np.nan, 0)],
|
||||
['kappa3', (-1, )],
|
||||
['ksone', (-1, )],
|
||||
['kstwo', (-1, )],
|
||||
['kstwobign', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (-1, )],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (-1, 1)],
|
||||
['logistic', ()],
|
||||
['loggamma', (-1, )],
|
||||
['loglaplace', (-1, )],
|
||||
['lognorm', (-1, )],
|
||||
['loguniform', (10, 5)],
|
||||
['lomax', (-1, )],
|
||||
['maxwell', ()],
|
||||
['mielke', (1, -2)],
|
||||
['moyal', ()],
|
||||
['nakagami', (-1, )],
|
||||
['ncx2', (-1, 2)],
|
||||
['ncf', (10, 20, -1)],
|
||||
['nct', (-1, 2)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (5, -10)],
|
||||
['pareto', (-1, )],
|
||||
['pearson3', (np.nan, )],
|
||||
['powerlaw', (-1, )],
|
||||
['powerlognorm', (1, -2)],
|
||||
['powernorm', (-1, )],
|
||||
['rdist', (-1, )],
|
||||
['rayleigh', ()],
|
||||
['rice', (-1, )],
|
||||
['recipinvgauss', (-1, )],
|
||||
['semicircular', ()],
|
||||
['skewnorm', (np.inf, )],
|
||||
['studentized_range', (-1, 1)],
|
||||
['t', (-1, )],
|
||||
['trapezoid', (0, 2)],
|
||||
['triang', (2, )],
|
||||
['truncexpon', (-1, )],
|
||||
['truncnorm', (10, 5)],
|
||||
['tukeylambda', (np.nan, )],
|
||||
['uniform', ()],
|
||||
['vonmises', (-1, )],
|
||||
['vonmises_line', (-1, )],
|
||||
['wald', ()],
|
||||
['weibull_min', (-1, )],
|
||||
['weibull_max', (-1, )],
|
||||
['wrapcauchy', (2, )],
|
||||
['reciprocal', (15, 10)],
|
||||
['skewcauchy', (2, )]
|
||||
]
|
||||
@@ -0,0 +1,340 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Apr 2 09:06:05 2021
|
||||
|
||||
@author: matth
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import special
|
||||
from typing import Optional, Union
|
||||
|
||||
__all__ = ['entropy', 'differential_entropy']
|
||||
|
||||
|
||||
def entropy(pk: np.typing.ArrayLike,
            qk: Optional[np.typing.ArrayLike] = None,
            base: Optional[float] = None,
            axis: int = 0
            ) -> Union[np.number, np.ndarray]:
    """Calculate the entropy of a distribution for given probability values.

    If only probabilities `pk` are given, the entropy is calculated as
    ``S = -sum(pk * log(pk), axis=axis)``.

    If `qk` is not None, then compute the Kullback-Leibler divergence
    ``S = sum(pk * log(pk / qk), axis=axis)``.

    This routine will normalize `pk` and `qk` if they don't sum to 1.

    Parameters
    ----------
    pk : array_like
        Defines the (discrete) distribution. Along each axis-slice of ``pk``,
        element ``i`` is the  (possibly unnormalized) probability of event
        ``i``.
    qk : array_like, optional
        Sequence against which the relative entropy is computed. Should be in
        the same format as `pk`.
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the entropy is calculated. Default is 0.

    Returns
    -------
    S : {float, array_like}
        The calculated entropy.

    Raises
    ------
    ValueError
        If `base` is given and is not positive.

    Examples
    --------

    >>> from scipy.stats import entropy

    Bernoulli trial with different p.
    The outcome of a fair coin is the most uncertain:

    >>> entropy([1/2, 1/2], base=2)
    1.0

    The outcome of a biased coin is less uncertain:

    >>> entropy([9/10, 1/10], base=2)
    0.46899559358928117

    Relative entropy:

    >>> entropy([1/2, 1/2], qk=[9/10, 1/10])
    0.5108256237659907

    """
    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    # Normalize so that every slice along `axis` sums to one.
    probs = np.asarray(pk)
    probs = 1.0*probs / np.sum(probs, axis=axis, keepdims=True)

    if qk is not None:
        # Relative entropy (Kullback-Leibler divergence) against `qk`.
        reference = np.asarray(qk)
        probs, reference = np.broadcast_arrays(probs, reference)
        reference = 1.0*reference / np.sum(reference, axis=axis,
                                           keepdims=True)
        terms = special.rel_entr(probs, reference)
    else:
        # Plain Shannon entropy: elementwise -p*log(p).
        terms = special.entr(probs)

    total = np.sum(terms, axis=axis)
    if base is None:
        return total

    # Convert from nats to the requested logarithmic base.
    total /= np.log(base)
    return total
|
||||
|
||||
|
||||
def differential_entropy(
    values: np.typing.ArrayLike,
    *,
    window_length: Optional[int] = None,
    base: Optional[float] = None,
    axis: int = 0,
    method: str = "auto",
) -> Union[np.number, np.ndarray]:
    r"""Given a sample of a distribution, estimate the differential entropy.

    Several estimation methods are available using the `method` parameter. By
    default, a method is selected based on the size of the sample.

    Parameters
    ----------
    values : sequence
        Sample from a continuous distribution.
    window_length : int, optional
        Window length for computing Vasicek estimate. Must be an integer
        between 1 and half of the sample size. If ``None`` (the default), it
        uses the heuristic value

        .. math::
            \left \lfloor \sqrt{n} + 0.5 \right \rfloor

        where :math:`n` is the sample size. This heuristic was originally
        proposed in [2]_ and has become common in the literature.
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the differential entropy is calculated.
        Default is 0.
    method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
        The method used to estimate the differential entropy from the sample.
        Default is ``'auto'``.  See Notes for more information.

    Returns
    -------
    entropy : float
        The calculated differential entropy.

    Raises
    ------
    ValueError
        If `window_length` is not positive or is not less than half the
        sample size, if `base` is given and is not positive, or if `method`
        is not one of the recognized names.

    Notes
    -----
    This function will converge to the true differential entropy in the limit

    .. math::
        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0

    The optimal choice of ``window_length`` for a given sample size depends on
    the (unknown) distribution. Typically, the smoother the density of the
    distribution, the larger the optimal value of ``window_length`` [1]_.

    The following options are available for the `method` parameter.

    * ``'vasicek'`` uses the estimator presented in [1]_. This is
      one of the first and most influential estimators of differential entropy.
    * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
      is not only consistent but, under some conditions, asymptotically normal.
    * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
      in simulation to have smaller bias and mean squared error than
      the Vasicek estimator.
    * ``'correa'`` uses the estimator presented in [5]_ based on local linear
      regression. In a simulation study, it had consistently smaller mean
      square error than the Vasicek estimator, but it is more expensive to
      compute.
    * ``'auto'`` selects the method automatically (default). Currently,
      this selects ``'van es'`` for very small samples (<= 10), ``'ebrahimi'``
      for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
      samples, but this behavior is subject to change in future versions.

    All estimators are implemented as described in [6]_.

    References
    ----------
    .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
           Journal of the Royal Statistical Society:
           Series B (Methodological), 38(1), 54-59.
    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
           goodness-of-fit test for exponentiality. Communications in
           Statistics-Theory and Methods, 28(5), 1183-1202.
    .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
           class of statistics based on spacings. Scandinavian Journal of
           Statistics, 61-72.
    .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
           of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
    .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
           in Statistics-Theory and Methods, 24(10), 2439-2449.
    .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
           Annals of Data Science, 2(2), 231-241.
           https://link.springer.com/article/10.1007/s40745-015-0045-9

    Examples
    --------
    >>> from scipy.stats import differential_entropy, norm

    Entropy of a standard normal distribution:

    >>> rng = np.random.default_rng()
    >>> values = rng.standard_normal(100)
    >>> differential_entropy(values)
    1.3407817436640392

    Compare with the true entropy:

    >>> float(norm.entropy())
    1.4189385332046727

    For several sample sizes between 5 and 1000, compare the accuracy of
    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
    compare the root mean squared error (over 1000 trials) between the estimate
    and the true differential entropy of the distribution.

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>>
    >>>
    >>> def rmse(res, expected):
    ...     '''Root mean squared error'''
    ...     return np.sqrt(np.mean((res - expected)**2))
    >>>
    >>>
    >>> a, b = np.log10(5), np.log10(1000)
    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
    >>> reps = 1000  # number of repetitions for each sample size
    >>> expected = stats.expon.entropy()
    >>>
    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
    >>> for method in method_errors:
    ...     for n in ns:
    ...        rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
    ...        res = stats.differential_entropy(rvs, method=method, axis=-1)
    ...        error = rmse(res, expected)
    ...        method_errors[method].append(error)
    >>>
    >>> for method, errors in method_errors.items():
    ...     plt.loglog(ns, errors, label=method)
    >>>
    >>> plt.legend()
    >>> plt.xlabel('sample size')
    >>> plt.ylabel('RMSE (1000 trials)')
    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')

    """
    # Work with the observations along the last axis.
    sample = np.moveaxis(np.asarray(values), axis, -1)
    n = sample.shape[-1]  # number of observations

    if window_length is None:
        # Heuristic default: sqrt(n) rounded to the nearest integer.
        window_length = math.floor(math.sqrt(n) + 0.5)

    if not 2 <= 2 * window_length < n:
        raise ValueError(
            f"Window length ({window_length}) must be positive and less "
            f"than half the sample size ({n}).",
        )

    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    # All estimators are functions of the order statistics.
    sorted_data = np.sort(sample, axis=-1)

    # "auto" only needs to be a valid key here; it is resolved to a concrete
    # estimator below.
    estimators = {"vasicek": _vasicek_entropy,
                  "van es": _van_es_entropy,
                  "correa": _correa_entropy,
                  "ebrahimi": _ebrahimi_entropy,
                  "auto": _vasicek_entropy}
    method = method.lower()
    if method not in estimators:
        raise ValueError(f"`method` must be one of {set(estimators)}")

    if method == "auto":
        # Choose the estimator from the sample size.
        method = ('van es' if n <= 10
                  else 'ebrahimi' if n <= 1000
                  else 'vasicek')

    res = estimators[method](sorted_data, window_length)

    if base is not None:
        # Convert from nats to the requested logarithmic base.
        res /= np.log(base)

    return res
|
||||
|
||||
|
||||
def _pad_along_last_axis(X, m):
|
||||
"""Pad the data for computing the rolling window difference."""
|
||||
# scales a bit better than method in _vasicek_like_entropy
|
||||
shape = np.array(X.shape)
|
||||
shape[-1] = m
|
||||
Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
|
||||
Xr = np.broadcast_to(X[..., [-1]], shape)
|
||||
return np.concatenate((Xl, X, Xr), axis=-1)
|
||||
|
||||
|
||||
def _vasicek_entropy(X, m):
    """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
    n = X.shape[-1]
    padded = _pad_along_last_axis(X, m)
    # Spacing between entries 2*m apart in the padded data; one per
    # original observation.
    spacings = padded[..., 2 * m:] - padded[..., :-2 * m]
    return np.mean(np.log(n/(2*m) * spacings), axis=-1)
|
||||
|
||||
|
||||
def _van_es_entropy(X, m):
|
||||
"""Compute the van Es estimator as described in [6]."""
|
||||
# No equation number, but referred to as HVE_mn.
|
||||
# Typo: there should be a log within the summation.
|
||||
n = X.shape[-1]
|
||||
difference = X[..., m:] - X[..., :-m]
|
||||
term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
|
||||
k = np.arange(m, n+1)
|
||||
return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
|
||||
|
||||
|
||||
def _ebrahimi_entropy(X, m):
    """Compute the Ebrahimi estimator as described in [6]."""
    # No equation number, but referred to as HE_mn
    n = X.shape[-1]
    padded = _pad_along_last_axis(X, m)

    spacings = padded[..., 2 * m:] - padded[..., :-2 * m]

    # Position-dependent weights: 2 in the interior, tapering towards 1 over
    # the first and last `m` positions.
    i = np.arange(1, n+1).astype(float)
    ci = np.full_like(i, 2.0)
    head = i <= m
    tail = i >= n - m + 1
    ci[head] = 1 + (i[head] - 1)/m
    ci[tail] = 1 + (n - i[tail])/m

    return np.mean(np.log(n * spacings / (ci * m)), axis=-1)
|
||||
|
||||
|
||||
def _correa_entropy(X, m):
    """Compute the Correa estimator as described in [6]."""
    # No equation number, but referred to as HC_mn
    n = X.shape[-1]
    padded = _pad_along_last_axis(X, m)

    i = np.arange(1, n+1)
    dj = np.arange(-m, m+1)[:, None]  # offsets within each window
    j = i + dj
    j0 = j + m - 1  # 0-indexed version of j

    # window[..., r, c] is the padded value at offset dj[r] from position
    # i[c]; the window axis is therefore axis -2.
    window = padded[..., j0]
    centered = window - np.mean(window, axis=-2, keepdims=True)
    num = np.sum(centered*dj, axis=-2)  # dj is d-i
    den = n*np.sum(centered**2, axis=-2)
    return -np.mean(np.log(num/den), axis=-1)
|
||||
@@ -0,0 +1,48 @@
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def isNPY_OLD():
    '''
    Return True when the installed NumPy is older than 1.19.

    A new random C API was added in 1.18 and became stable in 1.19.
    Prefer the new random C API when building with recent numpy.
    '''
    import numpy as np
    # Only the major and minor components take part in the comparison.
    major_minor = np.__version__.split('.')[:2]
    return tuple(map(int, major_minor)) < (1, 19)
|
||||
|
||||
|
||||
def make_biasedurn():
    '''Substitute True/False values for NPY_OLD Cython build variable.

    Reads ``_biasedurn.pyx.templ`` located next to this file, fills in its
    ``NPY_OLD`` format field and writes the result to ``_biasedurn.pyx``.
    '''
    base = (pathlib.Path(__file__).parent / '_biasedurn').absolute()
    template_path = base.with_suffix('.pyx.templ')
    output_path = base.with_suffix('.pyx')
    with open(template_path, 'r') as src:
        template = src.read()
    with open(output_path, 'w') as dest:
        dest.write(template.format(NPY_OLD=str(bool(isNPY_OLD()))))
|
||||
|
||||
|
||||
def make_unuran():
    """Substitute True/False values for NPY_OLD Cython build variable.

    Reads ``_unuran/unuran_wrapper.pyx.templ`` located next to this file,
    replaces the ``DEF NPY_OLD = isNPY_OLD`` placeholder with the actual
    value and writes the result to ``_unuran/unuran_wrapper.pyx``.
    """
    import re
    package_dir = pathlib.Path(__file__).parent
    base = (package_dir / "_unuran" / "unuran_wrapper").absolute()
    template_path = base.with_suffix(".pyx.templ")
    output_path = base.with_suffix(".pyx")
    with open(template_path, "r") as src:
        template = src.read()
    with open(output_path, "w") as dest:
        replacement = f"DEF NPY_OLD = {isNPY_OLD()}"
        dest.write(re.sub("DEF NPY_OLD = isNPY_OLD", replacement, template))
|
||||
|
||||
|
||||
def make_boost():
    """Run the code generator that lives inside the ``_boost`` directory."""
    here = pathlib.Path(__file__).parent
    generator = here / '_boost/include/code_gen.py'
    # check=True turns a non-zero exit status into CalledProcessError.
    command = [sys.executable, str(generator)]
    subprocess.run(command, check=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # When executed as a script, run each source-generation step in turn.
    make_biasedurn()
    make_unuran()
    make_boost()
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,151 @@
|
||||
import numpy as np
|
||||
|
||||
#pythran export _Aij(float[:,:], int, int)
|
||||
#pythran export _Aij(int[:,:], int, int)
|
||||
def _Aij(A, i, j):
|
||||
"""Sum of upper-left and lower right blocks of contingency table."""
|
||||
# See `somersd` References [2] bottom of page 309
|
||||
return A[:i, :j].sum() + A[i+1:, j+1:].sum()
|
||||
|
||||
#pythran export _Dij(float[:,:], int, int)
|
||||
#pythran export _Dij(int[:,:], int, int)
|
||||
def _Dij(A, i, j):
|
||||
"""Sum of lower-left and upper-right blocks of contingency table."""
|
||||
# See `somersd` References [2] bottom of page 309
|
||||
return A[i+1:, :j].sum() + A[:i, j+1:].sum()
|
||||
|
||||
|
||||
# pythran export _concordant_pairs(float[:,:])
# pythran export _concordant_pairs(int[:,:])
def _concordant_pairs(A):
    """Twice the number of concordant pairs, excluding ties."""
    # See `somersd` References [2] bottom of page 309
    n_rows, n_cols = A.shape
    total = 0
    # Weight each cell by the mass in its upper-left and lower-right blocks.
    for row in range(n_rows):
        for col in range(n_cols):
            total += A[row, col]*_Aij(A, row, col)
    return total
|
||||
|
||||
|
||||
# pythran export _discordant_pairs(float[:,:])
# pythran export _discordant_pairs(int[:,:])
def _discordant_pairs(A):
    """Twice the number of discordant pairs, excluding ties."""
    # See `somersd` References [2] bottom of page 309
    n_rows, n_cols = A.shape
    total = 0
    # Weight each cell by the mass in its lower-left and upper-right blocks.
    for row in range(n_rows):
        for col in range(n_cols):
            total += A[row, col]*_Dij(A, row, col)
    return total
|
||||
|
||||
|
||||
#pythran export _a_ij_Aij_Dij2(float[:,:])
#pythran export _a_ij_Aij_Dij2(int[:,:])
def _a_ij_Aij_Dij2(A):
    """A term that appears in the ASE of Kendall's tau and Somers' D."""
    # See `somersd` References [2] section 4: Modified ASEs to test the null hypothesis...
    n_rows, n_cols = A.shape
    total = 0
    for row in range(n_rows):
        for col in range(n_cols):
            # Difference between the concordant and discordant block sums.
            net = _Aij(A, row, col) - _Dij(A, row, col)
            total += A[row, col]*net**2
    return total
|
||||
|
||||
|
||||
#pythran export _compute_outer_prob_inside_method(int64, int64, int64, int64)
def _compute_outer_prob_inside_method(m, n, g, h):
    """
    Count the proportion of paths that do not stay strictly inside two
    diagonal lines.

    Parameters
    ----------
    m : integer
        m > 0
    n : integer
        n > 0
    g : integer
        g is greatest common divisor of m and n
    h : integer
        0 <= h <= lcm(m,n)

    Returns
    -------
    p : float
        The proportion of paths that do not stay inside the two lines.

    Notes
    -----
    The classical algorithm counts the integer lattice paths from (0, 0)
    to (m, n) which satisfy |x/m - y/n| < h / lcm(m, n).
    The paths make steps of size +1 in either positive x or positive y
    directions.
    We are, however, interested in 1 - proportion to compute p-values,
    so we change the recursion to compute 1 - p directly while staying
    within the "inside method" as described by Hodges.

    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
    Hodges, J.L. Jr.,
    "The Significance Probability of the Smirnov Two-Sample Test,"
    Arkiv for Matematik, 3, No. 43 (1958), 469-86.

    For the recursion for 1-p see
    Viehmann, T.: "Numerically more stable computation of the p-values
    for the two-sample Kolmogorov-Smirnov test," arXiv: 2102.08037

    """
    # Probability is symmetrical in m, n.  Computation below uses m >= n.
    if m < n:
        m, n = n, m
    mg = m // g
    ng = n // g

    # Count the integer lattice paths from (0, 0) to (m, n) which satisfy
    # |nx/g - my/g| < h.
    # Compute matrix A such that:
    #  A(x, 0) = A(0, y) = 1
    #  A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that
    #  A(x, y) = 0 if |x/m - y/n|>= h
    # Probability is A(m, n)/binom(m+n, n)
    # Optimizations exist for m==n, m==n*p.
    # Only need to preserve a single column of A, and only a
    # sliding window of it.
    # minj keeps track of the slide.
    minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1)
    curlen = maxj - minj
    # Make a vector long enough to hold maximum window needed.
    lenA = min(2 * maxj + 2, n + 1)
    # This is an integer calculation, but the entries are essentially
    # binomial coefficients, hence grow quickly.
    # Scaling after each column is computed avoids dividing by a
    # large binomial coefficient at the end, but is not sufficient to avoid
    # the large dynamic range which appears during the calculation.
    # Instead we rescale based on the magnitude of the right most term in
    # the column and keep track of an exponent separately and apply
    # it at the end of the calculation.  Similarly when multiplying by
    # the binomial coefficient
    dtype = np.float64
    A = np.ones(lenA, dtype=dtype)
    # Initialize the first column
    A[minj:maxj] = 0.0
    for i in range(1, m + 1):
        # Generate the next column.
        # First calculate the sliding window
        lastminj, lastlen = minj, curlen
        minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0)
        minj = min(minj, n)
        maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1)
        if maxj <= minj:
            # The window for this column is empty.
            return 1.0
        # Now fill in the values. We cannot use cumsum, unfortunately.
        val = 0.0 if minj == 0 else 1.0
        for jj in range(maxj - minj):
            j = jj + minj
            # A is updated in place: the previous column's entry for row j
            # is read at offset (j - lastminj) and the new entry is stored
            # at offset jj of the current window.
            val = (A[jj + minj - lastminj] * i + val * j) / (i + j)
            A[jj] = val
        curlen = maxj - minj
        if lastlen > curlen:
            # Set some carried-over elements to 1
            A[maxj - minj:maxj - minj + (lastlen - curlen)] = 1

    # Last entry of the final column's window.
    return A[maxj - minj - 1]
|
||||
@@ -0,0 +1,638 @@
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to SciPy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
# Standard library imports.
|
||||
import warnings
|
||||
|
||||
# SciPy imports.
|
||||
from scipy import linalg, special
|
||||
from scipy.special import logsumexp
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, dot, exp, pi,
|
||||
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
|
||||
ones, cov)
|
||||
import numpy as np
|
||||
|
||||
# Local imports.
|
||||
from . import _mvn
|
||||
from ._stats import gaussian_kernel_estimate
|
||||
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde:
    """Representation of a kernel-density estimate using Gaussian kernels.

    Kernel density estimation is a way to estimate the probability density
    function (PDF) of a random variable in a non-parametric way.
    `gaussian_kde` works for both uni-variate and multi-variate data.   It
    includes automatic bandwidth determination.  The estimation works best for
    a unimodal distribution; bimodal or multi-modal distributions tend to be
    oversmoothed.

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from. In case of univariate data this is a 1-D
        array, otherwise a 2-D array with shape (# of dims, # of data).
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth.  This can be
        'scott', 'silverman', a scalar constant or a callable.  If a scalar,
        this will be used directly as `kde.factor`.  If a callable, it should
        take a `gaussian_kde` instance as only parameter and return a scalar.
        If None (default), 'scott' is used.  See Notes for more details.
    weights : array_like, optional
        Weights of datapoints. This must be a 1-D array with one entry per
        datapoint (length ``n``); it is normalized to sum to one.
        If None (default), the samples are assumed to be equally weighted.

    Attributes
    ----------
    dataset : ndarray
        The dataset with which `gaussian_kde` was initialized.
    d : int
        Number of dimensions.
    n : int
        Number of datapoints.
    neff : float
        Effective number of datapoints.

        .. versionadded:: 1.2.0
    factor : float
        The bandwidth factor, obtained from `kde.covariance_factor`. The square
        of `kde.factor` multiplies the covariance matrix of the data in the kde
        estimation.
    covariance : ndarray
        The covariance matrix of `dataset`, scaled by the calculated bandwidth
        (`kde.factor`).
    inv_cov : ndarray
        The inverse of `covariance`.

    Methods
    -------
    evaluate
    __call__
    integrate_gaussian
    integrate_box_1d
    integrate_box
    integrate_kde
    pdf
    logpdf
    resample
    set_bandwidth
    covariance_factor

    Notes
    -----
    Bandwidth selection strongly influences the estimate obtained from the KDE
    (much more so than the actual shape of the kernel).  Bandwidth selection
    can be done by a "rule of thumb", by cross-validation, by "plug-in
    methods" or by other means; see [3]_, [4]_ for reviews.  `gaussian_kde`
    uses a rule of thumb, the default is Scott's Rule.

    Scott's Rule [1]_, implemented as `scotts_factor`, is::

        n**(-1./(d+4)),

    with ``n`` the number of data points and ``d`` the number of dimensions.
    In the case of unequally weighted points, `scotts_factor` becomes::

        neff**(-1./(d+4)),

    with ``neff`` the effective number of datapoints.
    Silverman's Rule [2]_, implemented as `silverman_factor`, is::

        (n * (d + 2) / 4.)**(-1. / (d + 4)).

    or in the case of unequally weighted points::

        (neff * (d + 2) / 4.)**(-1. / (d + 4)).

    Good general descriptions of kernel density estimation can be found in [1]_
    and [2]_, the mathematics for this multi-dimensional implementation can be
    found in [1]_.

    With a set of weighted samples, the effective number of datapoints ``neff``
    is defined by::

        neff = sum(weights)^2 / sum(weights^2)

    as detailed in [5]_.

    References
    ----------
    .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
           Visualization", John Wiley & Sons, New York, Chichester, 1992.
    .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
           Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
           Chapman and Hall, London, 1986.
    .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
           Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
    .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
           conditional density estimation", Computational Statistics & Data
           Analysis, Vol. 36, pp. 279-298, 2001.
    .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
           Series A (General), 132, 272

    Examples
    --------
    Generate some random two-dimensional data:

    >>> from scipy import stats
    >>> def measure(n):
    ...     "Measurement model, return two coupled measurements."
    ...     m1 = np.random.normal(size=n)
    ...     m2 = np.random.normal(scale=0.5, size=n)
    ...     return m1+m2, m1-m2

    >>> m1, m2 = measure(2000)
    >>> xmin = m1.min()
    >>> xmax = m1.max()
    >>> ymin = m2.min()
    >>> ymax = m2.max()

    Perform a kernel density estimate on the data:

    >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    >>> positions = np.vstack([X.ravel(), Y.ravel()])
    >>> values = np.vstack([m1, m2])
    >>> kernel = stats.gaussian_kde(values)
    >>> Z = np.reshape(kernel(positions).T, X.shape)

    Plot the results:

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
    ...           extent=[xmin, xmax, ymin, ymax])
    >>> ax.plot(m1, m2, 'k.', markersize=2)
    >>> ax.set_xlim([xmin, xmax])
    >>> ax.set_ylim([ymin, ymax])
    >>> plt.show()

    """
    def __init__(self, dataset, bw_method=None, weights=None):
        self.dataset = atleast_2d(asarray(dataset))
        if not self.dataset.size > 1:
            raise ValueError("`dataset` input should have multiple elements.")

        self.d, self.n = self.dataset.shape

        if weights is not None:
            # Store the weights normalized so that they sum to one.
            self._weights = atleast_1d(weights).astype(float)
            self._weights /= sum(self._weights)
            if self.weights.ndim != 1:
                raise ValueError("`weights` input should be one-dimensional.")
            if len(self._weights) != self.n:
                raise ValueError("`weights` input should be of length n")
            self._neff = 1/sum(self._weights**2)

        self.set_bandwidth(bw_method=bw_method)

    def evaluate(self, points):
        """Evaluate the estimated pdf on a set of points.

        Parameters
        ----------
        points : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The values at each point.

        Raises
        ------
        ValueError : if the dimensionality of the input points is different than
                     the dimensionality of the KDE.

        """
        points = atleast_2d(asarray(points))

        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = reshape(points, (self.d, 1))
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (d,
                    self.d)
                raise ValueError(msg)

        # Select the compiled kernel by the item size of the common float type.
        output_dtype = np.common_type(self.covariance, points)
        itemsize = np.dtype(output_dtype).itemsize
        if itemsize == 4:
            spec = 'float'
        elif itemsize == 8:
            spec = 'double'
        elif itemsize in (12, 16):
            spec = 'long double'
        else:
            raise TypeError('%s has unexpected item size %d' %
                            (output_dtype, itemsize))
        result = gaussian_kernel_estimate[spec](self.dataset.T, self.weights[:, None],
                                                points.T, self.inv_cov, output_dtype)
        return result[:, 0]

    __call__ = evaluate

    def integrate_gaussian(self, mean, cov):
        """
        Multiply estimated density by a multivariate Gaussian and integrate
        over the whole space.

        Parameters
        ----------
        mean : array_like
            A 1-D array, specifying the mean of the Gaussian.
        cov : array_like
            A 2-D array, specifying the covariance matrix of the Gaussian.

        Returns
        -------
        result : scalar
            The value of the integral.

        Raises
        ------
        ValueError
            If the mean or covariance of the input Gaussian differs from
            the KDE's dimensionality.

        """
        mean = atleast_1d(squeeze(mean))
        cov = atleast_2d(cov)

        if mean.shape != (self.d,):
            raise ValueError("mean does not have dimension %s" % self.d)
        if cov.shape != (self.d, self.d):
            raise ValueError("covariance does not have dimension %s" % self.d)

        # make mean a column vector
        mean = mean[:, newaxis]

        sum_cov = self.covariance + cov

        # This will raise LinAlgError if the new cov matrix is not s.p.d
        # cho_factor returns (ndarray, bool) where bool is a flag for whether
        # or not ndarray is upper or lower triangular
        sum_cov_chol = linalg.cho_factor(sum_cov)

        diff = self.dataset - mean
        tdiff = linalg.cho_solve(sum_cov_chol, diff)

        # Product of the Cholesky diagonal is sqrt(det(sum_cov)).
        sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
        norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det

        energies = sum(diff * tdiff, axis=0) / 2.0
        result = sum(exp(-energies)*self.weights, axis=0) / norm_const

        return result

    def integrate_box_1d(self, low, high):
        """
        Computes the integral of a 1D pdf between two bounds.

        Parameters
        ----------
        low : scalar
            Lower bound of integration.
        high : scalar
            Upper bound of integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDE is over more than one dimension.

        """
        if self.d != 1:
            raise ValueError("integrate_box_1d() only handles 1D pdfs")

        stdev = ravel(sqrt(self.covariance))[0]

        normalized_low = ravel((low - self.dataset) / stdev)
        normalized_high = ravel((high - self.dataset) / stdev)

        value = np.sum(self.weights*(
                        special.ndtr(normalized_high) -
                        special.ndtr(normalized_low)))
        return value

    def integrate_box(self, low_bounds, high_bounds, maxpts=None):
        """Computes the integral of a pdf over a rectangular interval.

        Parameters
        ----------
        low_bounds : array_like
            A 1-D array containing the lower bounds of integration.
        high_bounds : array_like
            A 1-D array containing the upper bounds of integration.
        maxpts : int, optional
            The maximum number of points to use for integration.

        Returns
        -------
        value : scalar
            The result of the integral.

        Warns
        -----
        UserWarning
            If the integration routine reports that it needed more points
            than it was allowed to use.

        """
        if maxpts is not None:
            extra_kwds = {'maxpts': maxpts}
        else:
            extra_kwds = {}

        value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
                                            self.dataset, self.weights,
                                            self.covariance, **extra_kwds)
        if inform:
            # Report the limit that actually applied: the caller's `maxpts`
            # when given, otherwise the `d * 1000` figure the message has
            # always quoted for the default.
            limit = self.d * 1000 if maxpts is None else maxpts
            msg = ('An integral in _mvn.mvnun requires more points than %s' %
                   limit)
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn(msg, stacklevel=2)

        return value

    def integrate_kde(self, other):
        """
        Computes the integral of the product of this kernel density estimate
        with another.

        Parameters
        ----------
        other : gaussian_kde instance
            The other kde.

        Returns
        -------
        value : scalar
            The result of the integral.

        Raises
        ------
        ValueError
            If the KDEs have different dimensionality.

        """
        if other.d != self.d:
            raise ValueError("KDEs are not the same dimensionality")

        # we want to iterate over the smallest number of points
        if other.n < self.n:
            small = other
            large = self
        else:
            small = self
            large = other

        sum_cov = small.covariance + large.covariance
        sum_cov_chol = linalg.cho_factor(sum_cov)
        result = 0.0
        for i in range(small.n):
            mean = small.dataset[:, i, newaxis]
            diff = large.dataset - mean
            tdiff = linalg.cho_solve(sum_cov_chol, diff)

            energies = sum(diff * tdiff, axis=0) / 2.0
            result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]

        sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
        norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det

        result /= norm_const

        return result

    def resample(self, size=None, seed=None):
        """Randomly sample a dataset from the estimated pdf.

        Parameters
        ----------
        size : int, optional
            The number of samples to draw.  If not provided, then the size is
            the same as the effective number of samples in the underlying
            dataset.
        seed : {None, int, `numpy.random.Generator`,
                `numpy.random.RandomState`}, optional

            If `seed` is None (or `np.random`), the `numpy.random.RandomState`
            singleton is used.
            If `seed` is an int, a new ``RandomState`` instance is used,
            seeded with `seed`.
            If `seed` is already a ``Generator`` or ``RandomState`` instance then
            that instance is used.

        Returns
        -------
        resample : (self.d, `size`) ndarray
            The sampled dataset.

        """
        if size is None:
            size = int(self.neff)

        random_state = check_random_state(seed)
        # Zero-mean kernel noise, added below to randomly chosen datapoints.
        norm = transpose(random_state.multivariate_normal(
            zeros((self.d,), float), self.covariance, size=size
        ))
        indices = random_state.choice(self.n, size=size, p=self.weights)
        means = self.dataset[:, indices]

        return means + norm

    def scotts_factor(self):
        """Compute Scott's factor.

        Returns
        -------
        s : float
            Scott's factor.
        """
        return power(self.neff, -1./(self.d+4))

    def silverman_factor(self):
        """Compute the Silverman factor.

        Returns
        -------
        s : float
            The silverman factor.
        """
        return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))

    #  Default method to calculate bandwidth, can be overwritten by subclass
    def covariance_factor(self):
        """Computes the coefficient (`kde.factor`) that
        multiplies the data covariance matrix to obtain the kernel covariance
        matrix. The default is `scotts_factor`.  A subclass can overwrite this
        method to provide a different method, or set it through a call to
        `kde.set_bandwidth`."""
        # This is a separate function rather than an alias of `scotts_factor`
        # so that it carries its own docstring: assigning `__doc__` on an
        # alias would overwrite the docstring of `scotts_factor` as well.
        # The call is bound to this class's `scotts_factor`, exactly as the
        # alias was.
        return gaussian_kde.scotts_factor(self)

    def set_bandwidth(self, bw_method=None):
        """Compute the estimator bandwidth with given method.

        The new bandwidth calculated after a call to `set_bandwidth` is used
        for subsequent evaluations of the estimated density.

        Parameters
        ----------
        bw_method : str, scalar or callable, optional
            The method used to calculate the estimator bandwidth.  This can be
            'scott', 'silverman', a scalar constant or a callable.  If a
            scalar, this will be used directly as `kde.factor`.  If a callable,
            it should take a `gaussian_kde` instance as only parameter and
            return a scalar.  If None (default), nothing happens; the current
            `kde.covariance_factor` method is kept.

        Raises
        ------
        ValueError
            If `bw_method` is none of the accepted kinds.

        Notes
        -----
        .. versionadded:: 0.11

        Examples
        --------
        >>> import scipy.stats as stats
        >>> x1 = np.array([-7, -5, 1, 4, 5.])
        >>> kde = stats.gaussian_kde(x1)
        >>> xs = np.linspace(-10, 10, num=50)
        >>> y1 = kde(xs)
        >>> kde.set_bandwidth(bw_method='silverman')
        >>> y2 = kde(xs)
        >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
        >>> y3 = kde(xs)

        >>> import matplotlib.pyplot as plt
        >>> fig, ax = plt.subplots()
        >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
        ...         label='Data points (rescaled)')
        >>> ax.plot(xs, y1, label='Scott (default)')
        >>> ax.plot(xs, y2, label='Silverman')
        >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
        >>> ax.legend()
        >>> plt.show()

        """
        if bw_method is None:
            pass
        elif bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif np.isscalar(bw_method) and not isinstance(bw_method, str):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
                  "or a callable."
            raise ValueError(msg)

        self._compute_covariance()

    def _compute_covariance(self):
        """Computes the covariance matrix for each Gaussian kernel using
        covariance_factor().
        """
        self.factor = self.covariance_factor()
        # Cache covariance and inverse covariance of the data
        if not hasattr(self, '_data_inv_cov'):
            self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                               bias=False,
                                               aweights=self.weights))
            self._data_inv_cov = linalg.inv(self._data_covariance)

        self.covariance = self._data_covariance * self.factor**2
        self.inv_cov = self._data_inv_cov / self.factor**2
        # log(det(2*pi*covariance)) via the Cholesky factor's diagonal.
        L = linalg.cholesky(self.covariance*2*pi)
        self.log_det = 2*np.log(np.diag(L)).sum()

    def pdf(self, x):
        """
        Evaluate the estimated pdf on a provided set of points.

        Notes
        -----
        This is an alias for `gaussian_kde.evaluate`.  See the ``evaluate``
        docstring for more details.

        """
        return self.evaluate(x)

    def logpdf(self, x):
        """
        Evaluate the log of the estimated pdf on a provided set of points.

        Parameters
        ----------
        x : (# of dimensions, # of points)-array
            Alternatively, a (# of dimensions,) vector can be passed in and
            treated as a single point.

        Returns
        -------
        values : (# of points,)-array
            The log of the estimated pdf at each point.

        Raises
        ------
        ValueError
            If the dimensionality of the input points is different than
            the dimensionality of the KDE.
        """
        points = atleast_2d(x)

        d, m = points.shape
        if d != self.d:
            if d == 1 and m == self.d:
                # points was passed in as a row vector
                points = reshape(points, (self.d, 1))
                m = 1
            else:
                msg = "points have dimension %s, dataset has dimension %s" % (d,
                    self.d)
                raise ValueError(msg)

        if m >= self.n:
            # there are more points than data, so loop over data
            energy = np.empty((self.n, m), dtype=float)
            for i in range(self.n):
                diff = self.dataset[:, i, newaxis] - points
                tdiff = dot(self.inv_cov, diff)
                energy[i] = sum(diff*tdiff, axis=0)
            log_to_sum = 2.0 * np.log(self.weights) - self.log_det - energy.T
            result = logsumexp(0.5 * log_to_sum, axis=1)
        else:
            # loop over points
            result = np.empty((m,), dtype=float)
            for i in range(m):
                diff = self.dataset - points[:, i, newaxis]
                tdiff = dot(self.inv_cov, diff)
                energy = sum(diff * tdiff, axis=0)
                log_to_sum = 2.0 * np.log(self.weights) - self.log_det - energy
                result[i] = logsumexp(0.5 * log_to_sum)

        return result

    @property
    def weights(self):
        """Normalized weights; equal weights are created on first access
        when none were supplied."""
        try:
            return self._weights
        except AttributeError:
            self._weights = ones(self.n)/self.n
            return self._weights

    @property
    def neff(self):
        """Effective number of datapoints, ``1 / sum(weights**2)``, computed
        on first access if not already stored."""
        try:
            return self._neff
        except AttributeError:
            self._neff = 1/sum(self.weights**2)
            return self._neff
|
||||
@@ -0,0 +1,596 @@
|
||||
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
|
||||
# D_n = sup_x{|F_n(x) - F(x)|},
|
||||
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
|
||||
# F(x) is the CDF of a probability distribution.
|
||||
#
|
||||
# Exact methods:
|
||||
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
|
||||
# or a recursion algorithm due to Pomeranz[2].
|
||||
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
|
||||
# the Durbin algorithm.
|
||||
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
|
||||
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
|
||||
# For d > 0.5, the latter intersection probability is 0.
|
||||
#
|
||||
# Approximate methods:
|
||||
# For d close to 0.5, ignoring that intersection term may still give a
|
||||
# reasonable approximation.
|
||||
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
|
||||
# Kolmogorov's initial asymptotic, suitable for large d. (See
|
||||
# scipy.special.kolmogorov for that asymptotic)
|
||||
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
|
||||
#   transform the Li-Chien/Korolyuk formula to produce a computational formula
|
||||
# suitable for small d.
|
||||
#
|
||||
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
|
||||
# the above approaches and it is that which is used here.
|
||||
#
|
||||
# Other approaches:
|
||||
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
|
||||
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
|
||||
|
||||
# References:
|
||||
# [1] Durbin J (1968).
|
||||
# "The Probability that the Sample Distribution Function Lies Between Two
|
||||
# Parallel Straight Lines."
|
||||
# Annals of Mathematical Statistics, 39, 398-411.
|
||||
# [2] Pomeranz J (1974).
|
||||
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
|
||||
# Small Samples (Algorithm 487)."
|
||||
# Communications of the ACM, 17(12), 703-704.
|
||||
# [3] Marsaglia G, Tsang WW, Wang J (2003).
|
||||
# "Evaluating Kolmogorov's Distribution."
|
||||
# Journal of Statistical Software, 8(18), 1-4.
|
||||
# [4] LI-CHIEN, C. (1956).
|
||||
# "On the exact distribution of the statistics of A. N. Kolmogorov and
|
||||
# their asymptotic expansion."
|
||||
#     Acta Mathematica Sinica, 6, 55-81.
|
||||
# [5] KOROLYUK, V. S. (1960).
|
||||
# "Asymptotic analysis of the distribution of the maximum deviation in
|
||||
# the Bernoulli scheme."
|
||||
# Theor. Probability Appl., 4, 339-366.
|
||||
# [6] Pelz W, Good IJ (1976).
|
||||
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
|
||||
# Statistic."
|
||||
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
|
||||
# [7] Simard, R., L'Ecuyer, P. (2011)
|
||||
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
|
||||
# Journal of Statistical Software, Vol 39, 11, 1-18.
|
||||
# [8] Carvalho, Luis (2015)
|
||||
# "An Improved Evaluation of Kolmogorov's Distribution"
|
||||
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
|
||||
# [9] Amit Moscovich, Boaz Nadler (2017)
|
||||
# "Fast calculation of boundary crossing probabilities for Poisson
|
||||
# processes",
|
||||
# Statistics & Probability Letters, Vol 123, 177-182.
|
||||
|
||||
|
||||
import numpy as np
|
||||
import scipy.special
|
||||
import scipy.special._ufuncs as scu
|
||||
import scipy.misc
|
||||
|
||||
# Long-double scale factors 2**128 and 2**-128 (np.ldexp(x, e) is x * 2**e).
# NOTE(review): presumably used to rescale intermediate results elsewhere in
# this module -- confirm against the functions that reference them.
_E128 = 128
_EP128 = np.ldexp(np.longdouble(1), _E128)
_EM128 = np.ldexp(np.longdouble(1), -_E128)

# Precomputed numeric constants.
_SQRT2PI = np.sqrt(2 * np.pi)
_LOG_2PI = np.log(2 * np.pi)
# NOTE(review): looks like a lower cutoff for logarithms/exponents of
# double-precision values -- confirm where it is used.
_MIN_LOG = -708
_SQRT3 = np.sqrt(3)
_PI_SQUARED = np.pi ** 2
_PI_FOUR = np.pi ** 4
_PI_SIX = np.pi ** 6

# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
# then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
# The order is highest degree first, which is what np.polyval expects.
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
                    -1.9175269175269175269e-3, 8.4175084175084175084e-4,
                    -5.952380952380952381e-4, 7.9365079365079365079e-4,
                    -2.7777777777777777778e-3, 8.3333333333333333333e-2]
|
||||
|
||||
def _log_nfactorial_div_n_pow_n(n):
    """Return log(n! / n**n), i.e. log((n-1)! / n**(n-1)).

    Stirling's series for log(n!) is used with the leading n*log(n) term
    cancelled analytically, which avoids subtractive cancellation:

        log(n!/n**n) = log(n)/2 - n + log(sqrt(2*pi))
                       + sum_j B_{2j}/(2j)/(2j-1)/n**(2j-1)
    """
    inv_n = 1.0/n
    # The series is an odd polynomial in 1/n: factor out one 1/n and evaluate
    # the remaining polynomial in 1/n**2.
    stirling_tail = inv_n * np.polyval(_STIRLING_COEFFS, inv_n/n)
    return np.log(n)/2 - n + _LOG_2PI/2 + stirling_tail
|
||||
|
||||
|
||||
def _clip_prob(p):
|
||||
"""clips a probability to range 0<=p<=1."""
|
||||
return np.clip(p, 0.0, 1.0)
|
||||
|
||||
|
||||
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
|
||||
"""Selects either the CDF or SF, and then clips to range 0<=p<=1."""
|
||||
p = np.where(cdf, cdfprob, sfprob)
|
||||
return _clip_prob(p)
|
||||
|
||||
|
||||
def _kolmogn_DMTW(n, d, cdf=True):
    r"""Compute Pr(D_n <= d) (or its complement) with the Durbin matrix method.

    This is the Marsaglia-Tsang-Wang formulation of Durbin's algorithm:
    Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].

    Write d = (k-h)/n with k a positive integer and 0 <= h < 1, build the
    m*m matrix H with m = 2k-1, and read the central entry of
    (n!/n^n) * H^n.  Intermediate results are rescaled by powers of two.
    Requires memory O(m^2) and computation O(m^2 log(n)), so it is most
    suitable for small m.

    Parameters
    ----------
    n : int
        Sample size.
    d : float
        Value of the statistic.
    cdf : bool, optional
        Return the CDF if True (default), otherwise the SF.
    """
    if d >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf)
    nd = n * d
    if nd <= 0.5:
        return _select_and_clip_prob(0.0, 1.0, cdf)

    k = int(np.ceil(nd))
    h = k - nd
    m = 2 * k - 1
    mid = k - 1  # index of the central row/column of H

    # col is the first column (and, reversed, the last row) of H:
    #   col[j] = (1 - h^(j+1)) / (j+1)!   (except for col[-1])
    # inv_fact[j] = 1 / j!
    orders = np.arange(1, m + 1)
    col = 1.0 - h ** orders
    inv_fact = np.empty(m)
    fac = 1.0
    for pos, j in enumerate(orders):
        inv_fact[pos] = fac
        fac /= j  # This might underflow.  Isn't a problem.
        col[pos] *= fac
    corner = max(2 * h - 1.0, 0)**m - 2*h**m
    col[-1] = (1.0 + corner) * fac

    H = np.zeros((m, m))
    for c in range(1, m):
        H[c - 1:, c] = inv_fact[:m - c + 1]
    H[:, 0] = col
    H[-1, :] = col[::-1]

    # Binary exponentiation of H.  `acc` accumulates the needed powers;
    # acc_exp / H_exp are the binary exponents factored out of acc / H.
    acc = np.eye(m)
    acc_exp = 0
    H_exp = 0
    remaining = n
    while remaining > 0:
        if remaining % 2:
            acc = acc @ H
            acc_exp += H_exp
        H = H @ H
        H_exp *= 2
        # Scale as needed.
        if np.abs(H[mid, mid]) > _EP128:
            H /= _EP128
            H_exp += _E128
        remaining //= 2

    p = acc[mid, mid]

    # Multiply by n!/n^n one factor at a time, rescaling on underflow.
    for i in range(1, n + 1):
        p = i * p / n
        if np.abs(p) < _EM128:
            p *= _EP128
            acc_exp -= _E128

    # Undo the accumulated scaling.
    if acc_exp != 0:
        p = np.ldexp(p, acc_exp)

    return _select_and_clip_prob(p, 1.0-p, cdf)
|
||||
|
||||
|
||||
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
|
||||
"""Compute the endpoints of the interval for row i."""
|
||||
if i == 0:
|
||||
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
|
||||
else:
|
||||
# i + 1 = 2*ip1div2 + ip1mod2
|
||||
ip1div2, ip1mod2 = divmod(i + 1, 2)
|
||||
if ip1mod2 == 0: # i is odd
|
||||
if ip1div2 == n + 1:
|
||||
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
|
||||
|
||||
return max(j1 + 2, 0), min(j2, n)
|
||||
|
||||
|
||||
def _kolmogn_Pomeranz(n, x, cdf=True):
    r"""Compute Pr(D_n <= x) (or its complement) by the Pomeranz recursion.

    Pomeranz (1974) [2]

    Conceptually a matrix V is filled row by row, each row being the
    convolution of the previous row with (unnormalized) Poisson
    probabilities; the desired CDF is n! times the final entry of the final
    row.  Only two rows are alive at any time, and only a contiguous window
    of each row can be non-zero, so the implementation keeps two buffers
    together with the column offset at which each window starts.
    Intermediate results are rescaled by powers of two as needed.
    """
    t = n * x
    ll = int(np.floor(t))
    frac = 1.0 * (t - ll)  # fractional part of t
    g = min(frac, 1.0 - frac)
    ceilf = 1 if frac > 0 else 0
    roundf = 1 if frac > 0.5 else 0
    npwrs = 2 * (ll + 1)  # Maximum number of powers needed in convolutions

    # Only three different Poisson-like weight tables can occur:
    #   gpower[m]      = (g/n)^m / m!
    #   twogpower[m]   = (2g/n)^m / m!
    #   onem2gpower[m] = ((1-2g)/n)^m / m!
    # They are *almost* Poisson probs, just missing the normalizing factor.
    gpower = np.empty(npwrs)
    twogpower = np.empty(npwrs)
    onem2gpower = np.empty(npwrs)
    ratios = (g/n, 2*g/n, (1 - 2*g)/n)
    for table, ratio in zip((gpower, twogpower, onem2gpower), ratios):
        table[0] = 1.0
        for m in range(1, npwrs):
            table[m] = table[m - 1] * ratio / m

    expnt = 0  # binary exponent factored out of the rows so far
    prev_row = np.zeros([npwrs])
    cur_row = np.zeros([npwrs])
    cur_row[0] = 1  # first row
    prev_start, cur_start = 0, 0  # column offsets of the two windows

    j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
    last_row = 2 * n + 1
    for i in range(1, last_row + 1):
        # The row finished in the previous iteration becomes prev_row; the
        # other buffer is recycled for the row computed now.
        k1 = j1
        prev_row, cur_row = cur_row, prev_row
        prev_start, cur_start = cur_start, prev_start
        cur_row.fill(0.0)
        j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
        if i == 1 or i == last_row:
            pwrs = gpower
        elif i % 2:
            pwrs = twogpower
        else:
            pwrs = onem2gpower
        ln2 = j2 - k1 + 1
        if ln2 > 0:
            lo = k1 - prev_start
            conv = np.convolve(prev_row[lo:lo + ln2], pwrs[:ln2])
            conv_start = j1 - k1  # First index to use from conv
            conv_len = j2 - j1 + 1  # Number of entries to use from conv
            cur_row[:conv_len] = conv[conv_start:conv_start + conv_len]
            # Scale to avoid underflow.
            if 0 < np.max(cur_row) < _EM128:
                cur_row *= _EP128
                expnt -= _E128
            cur_start = prev_start + j1 - k1

    # Multiply by n!, rescaling downwards whenever the product gets large.
    ans = cur_row[n - cur_start]
    for m in range(1, n + 1):
        if np.abs(ans) > _EP128:
            ans *= _EM128
            expnt += _E128
        ans *= m

    # Undo any intermediate scaling
    if expnt != 0:
        ans = np.ldexp(ans, expnt)
    return _select_and_clip_prob(ans, 1.0 - ans, cdf)
|
||||
|
||||
|
||||
def _kolmogn_PelzGood(n, x, cdf=True):
    """Compute the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.

    Start with the Li-Chien, Korolyuk approximation:
        Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
    where z = x*sqrt(n).
    Each K_i(z) is transformed using Jacobi theta functions into a form
    suitable for small z.
    Pelz-Good (1976). [6]

    If `cdf` is False the complementary value is returned instead.  The
    result of the series is not clipped to [0, 1] here.
    """
    if x <= 0.0:
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    if x >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)

    z = np.sqrt(n) * x
    zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6

    qlog = -_PI_SQUARED / 8 / zsquared
    if qlog < _MIN_LOG:  # z ~ 0.041743441416853426
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    q = np.exp(qlog)

    # Coefficients of the powers of m in the summands of K1, K2 and K3
    k1a = -zsquared
    k1b = _PI_SQUARED / 4

    k2a = 6 * zsix + 2 * zfour
    k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
    k2c = _PI_FOUR * (1 - 2 * zsquared) / 16

    k3a = -30 * zsix - 90 * z**8
    k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
    k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
    k3d = _PI_SIX * (5 - 30 * zsquared) / 64

    # Horner-style evaluation of sum_m c(m) q^(m^2) over odd m = 2k-1,
    # highest k first; all four series K0..K3 are advanced together.
    nterms = int(np.ceil(16 * z / np.pi))
    series = np.zeros(4)
    for k in range(nterms, 0, -1):
        m = 2 * k - 1
        msquared, mfour, msix = m**2, m**4, m**6
        series *= np.power(q, 8 * k)
        series += np.array([1.0,
                            k1a + k1b*msquared,
                            k2a + k2b*msquared + k2c*mfour,
                            k3a + k3b*msquared + k3c*mfour + k3d*msix])
    series *= q
    series *= _SQRT2PI
    # z**10 > 0 as z > 0.04
    series /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])

    # Now add the remaining sums, which run over all integers k:
    #   K_2: (pi^2 k^2) q^(k^2),
    #   K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
    # Not much subtractive cancellation is expected, so sum directly.
    q_int = np.exp(-_PI_SQUARED / 2 / zsquared)
    ks = np.arange(nterms, 0, -1)
    ksquared = ks ** 2
    sqrt3z = _SQRT3 * z
    kspi = np.pi * ks
    qpwers = q_int ** ksquared

    k2extra = np.sum(ksquared * qpwers)
    k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
    series[2] += k2extra

    k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
    k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
    series[3] += k3extra

    # K_i is weighted by n**(-i/2).
    series /= np.power(n * 1.0, np.arange(len(series)) / 2.0)

    if not cdf:
        # SF = 1 - CDF: negate every term and add 1 to the leading one.
        series *= -1
        series[0] += 1

    return sum(series)
|
||||
|
||||
|
||||
def _kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
|
||||
Simard & L'Ecuyer (2011) [7].
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
|
||||
if x <= 0.0:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
t = n * x
|
||||
if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
|
||||
if t <= 0.5:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
if n <= 140:
|
||||
prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
|
||||
else:
|
||||
prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if t >= n - 1: # Ruben-Gambino
|
||||
prob = 2 * (1.0 - x)**n
|
||||
return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
|
||||
if x >= 0.5: # Exact: 2 * smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
nxsquared = t * x
|
||||
if n <= 140:
|
||||
if nxsquared <= 0.754693:
|
||||
prob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if nxsquared <= 4:
|
||||
prob = _kolmogn_Pomeranz(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
# Now use Miller approximation of 2*smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
# Split CDF and SF as they have different cutoffs on nxsquared.
|
||||
if not cdf:
|
||||
if nxsquared >= 370.0:
|
||||
return 0.0
|
||||
if nxsquared >= 2.2:
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _clip_prob(prob)
|
||||
# Fall through and compute the SF as 1.0-CDF
|
||||
if nxsquared >= 18.0:
|
||||
cdfprob = 1.0
|
||||
elif n <= 100000 and n * x**1.5 <= 1.4:
|
||||
cdfprob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
else:
|
||||
cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
|
||||
return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
|
||||
|
||||
|
||||
def _kolmogn_p(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0 or x <= 0:
|
||||
return 0
|
||||
t = n * x
|
||||
if t <= 1.0:
|
||||
# Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
|
||||
if t <= 0.5:
|
||||
return 0.0
|
||||
if n <= 140:
|
||||
prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
|
||||
else:
|
||||
prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
|
||||
return prd * 2 * n**2
|
||||
if t >= n - 1:
|
||||
# Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
|
||||
return 2 * (1.0 - x) ** (n-1) * n
|
||||
if x >= 0.5:
|
||||
return 2 * scipy.stats.ksone.pdf(x, n)
|
||||
|
||||
# Just take a small delta.
|
||||
# Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer a.
|
||||
# as the CDF is a piecewise degree n polynomial.
|
||||
# It has knots at 1/n, 2/n, ... (n-1)/n
|
||||
# and is not a C-infinity function at the knots
|
||||
delta = x / 2.0**16
|
||||
delta = min(delta, x - 1.0/n)
|
||||
delta = min(delta, 0.5 - x)
|
||||
|
||||
def _kk(_x):
|
||||
return kolmogn(n, _x)
|
||||
|
||||
return scipy.misc.derivative(_kk, x, dx=delta, order=5)
|
||||
|
||||
|
||||
def _kolmogni(n, p, q):
|
||||
"""Computes the PPF/ISF of kolmogn.
|
||||
|
||||
n of type integer, n>= 1
|
||||
p is the CDF, q the SF, p+q=1
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if p <= 0:
|
||||
return 1.0/n
|
||||
if q <= 0:
|
||||
return 1.0
|
||||
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
|
||||
if delta <= 1.0/n:
|
||||
return (delta + 1.0 / n) / 2
|
||||
x = -np.expm1(np.log(q/2.0)/n)
|
||||
if x >= 1 - 1.0/n:
|
||||
return x
|
||||
x1 = scu._kolmogci(p)/np.sqrt(n)
|
||||
x1 = min(x1, 1.0 - 1.0/n)
|
||||
_f = lambda x: _kolmogn(n, x) - p
|
||||
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
|
||||
|
||||
|
||||
def kolmogn(n, x, cdf=True):
    """Compute the CDF for the two-sided Kolmogorov-Smirnov distribution.

    The two-sided Kolmogorov-Smirnov distribution has as its CDF
    Pr(D_n <= x), for a sample of size n drawn from a distribution with
    CDF F(t), where D_n = sup_t |F_n(t) - F(t)| and F_n(t) is the Empirical
    Cumulative Distribution Function of the sample.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    x : float, array_like
        The K-S statistic, float between 0 and 1
    cdf : bool, optional
        whether to compute the CDF (default=True) or the SF.

    Returns
    -------
    cdf : ndarray
        CDF (or SF if cdf is False) at the specified locations.

    The return value has shape the result of numpy broadcasting n and x.

    Raises
    ------
    ValueError
        If an element of `n` is neither NaN nor integral.
    """
    it = np.nditer([n, x, cdf, None],
                   op_dtypes=[None, np.float64, np.bool_, np.float64])
    for n_i, x_i, cdf_i, out_i in it:
        if np.isnan(n_i):
            # Propagate NaN sample sizes unchanged.
            out_i[...] = n_i
        elif int(n_i) != n_i:
            raise ValueError(f'n is not integral: {n_i}')
        else:
            out_i[...] = _kolmogn(int(n_i), x_i, cdf=cdf_i)
    return it.operands[-1]
|
||||
|
||||
|
||||
def kolmognp(n, x):
    """Compute the PDF for the two-sided Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    x : float, array_like
        The K-S statistic, float between 0 and 1

    Returns
    -------
    pdf : ndarray
        The PDF at the specified locations

    The return value has shape the result of numpy broadcasting n and x.

    Raises
    ------
    ValueError
        If an element of `n` is neither NaN nor integral.
    """
    it = np.nditer([n, x, None])
    for n_i, x_i, out_i in it:
        if np.isnan(n_i):
            # Propagate NaN sample sizes unchanged.
            out_i[...] = n_i
        elif int(n_i) != n_i:
            raise ValueError(f'n is not integral: {n_i}')
        else:
            out_i[...] = _kolmogn_p(int(n_i), x_i)
    return it.operands[-1]
|
||||
|
||||
|
||||
def kolmogni(n, q, cdf=True):
    """Compute the PPF (or ISF) for the two-sided Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    n : integer, array_like
        the number of samples
    q : float, array_like
        Probabilities, float between 0 and 1
    cdf : bool, optional
        whether to compute the PPF (default=True) or the ISF.

    Returns
    -------
    ppf : ndarray
        PPF (or ISF if cdf is False) at the specified locations

    The return value has shape the result of numpy broadcasting n and q.

    Raises
    ------
    ValueError
        If an element of `n` is neither NaN nor integral.
    """
    it = np.nditer([n, q, cdf, None])
    for n_i, q_i, cdf_i, out_i in it:
        if np.isnan(n_i):
            # Propagate NaN sample sizes unchanged.
            out_i[...] = n_i
            continue
        if int(n_i) != n_i:
            raise ValueError(f'n is not integral: {n_i}')
        # The helper wants both tails: q_i is the CDF value when cdf_i is
        # true and the SF value otherwise.
        if cdf_i:
            pcdf, psf = q_i, 1-q_i
        else:
            pcdf, psf = 1-q_i, q_i
        out_i[...] = _kolmogni(int(n_i), pcdf, psf)
    return it.operands[-1]
|
||||
@@ -0,0 +1,425 @@
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
from scipy import special
|
||||
from scipy import stats
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
def _broadcast_concatenate(x, y, axis):
|
||||
'''Broadcast then concatenate arrays, leaving concatenation axis last'''
|
||||
x = np.moveaxis(x, axis, -1)
|
||||
y = np.moveaxis(y, axis, -1)
|
||||
z = np.broadcast(x[..., 0], y[..., 0])
|
||||
x = np.broadcast_to(x, z.shape + (x.shape[-1],))
|
||||
y = np.broadcast_to(y, z.shape + (y.shape[-1],))
|
||||
z = np.concatenate((x, y), axis=-1)
|
||||
return x, y, z
|
||||
|
||||
|
||||
class _MWU:
|
||||
'''Distribution of MWU statistic under the null hypothesis'''
|
||||
# Possible improvement: if m and n are small enough, use integer arithmetic
|
||||
|
||||
def __init__(self):
|
||||
'''Minimal initializer'''
|
||||
self._fmnks = -np.ones((1, 1, 1))
|
||||
|
||||
def pmf(self, k, m, n):
|
||||
'''Probability mass function'''
|
||||
self._resize_fmnks(m, n, np.max(k))
|
||||
# could loop over just the unique elements, but probably not worth
|
||||
# the time to find them
|
||||
for i in np.ravel(k):
|
||||
self._f(m, n, i)
|
||||
return self._fmnks[m, n, k] / special.binom(m + n, m)
|
||||
|
||||
def cdf(self, k, m, n):
|
||||
'''Cumulative distribution function'''
|
||||
# We could use the fact that the distribution is symmetric to avoid
|
||||
# summing more than m*n/2 terms, but it might not be worth the
|
||||
# overhead. Let's leave that to an improvement.
|
||||
pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)
|
||||
cdfs = np.cumsum(pmfs)
|
||||
return cdfs[k]
|
||||
|
||||
def sf(self, k, m, n):
|
||||
'''Survival function'''
|
||||
# Use the fact that the distribution is symmetric; i.e.
|
||||
# _f(m, n, m*n-k) = _f(m, n, k), and sum from the left
|
||||
k = m*n - k
|
||||
# Note that both CDF and SF include the PMF at k. The p-value is
|
||||
# calculated from the SF and should include the mass at k, so this
|
||||
# is desirable
|
||||
return self.cdf(k, m, n)
|
||||
|
||||
def _resize_fmnks(self, m, n, k):
|
||||
'''If necessary, expand the array that remembers PMF values'''
|
||||
# could probably use `np.pad` but I'm not sure it would save code
|
||||
shape_old = np.array(self._fmnks.shape)
|
||||
shape_new = np.array((m+1, n+1, k+1))
|
||||
if np.any(shape_new > shape_old):
|
||||
shape = np.maximum(shape_old, shape_new)
|
||||
fmnks = -np.ones(shape) # create the new array
|
||||
m0, n0, k0 = shape_old
|
||||
fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values
|
||||
self._fmnks = fmnks
|
||||
|
||||
def _f(self, m, n, k):
|
||||
'''Recursive implementation of function of [3] Theorem 2.5'''
|
||||
|
||||
# [3] Theorem 2.5 Line 1
|
||||
if k < 0 or m < 0 or n < 0 or k > m*n:
|
||||
return 0
|
||||
|
||||
# if already calculated, return the value
|
||||
if self._fmnks[m, n, k] >= 0:
|
||||
return self._fmnks[m, n, k]
|
||||
|
||||
if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2
|
||||
fmnk = 1
|
||||
else: # [3] Theorem 2.5 Line 3 / Equation 3
|
||||
fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)
|
||||
|
||||
self._fmnks[m, n, k] = fmnk # remember result
|
||||
|
||||
return fmnk
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'
|
||||
_mwu_state = _MWU()
|
||||
|
||||
|
||||
def _tie_term(ranks):
|
||||
"""Tie correction term"""
|
||||
# element i of t is the number of elements sharing rank i
|
||||
_, t = np.unique(ranks, return_counts=True, axis=-1)
|
||||
return (t**3 - t).sum(axis=-1)
|
||||
|
||||
|
||||
def _get_mwu_z(U, n1, n2, ranks, axis=0, continuity=True):
    '''Standardized MWU statistic

    Subtracts the null mean n1*n2/2 from `U` (and a further 0.5 when
    `continuity` is true) and divides by the tie-corrected standard
    deviation.  The `axis` argument is accepted but not used; ties are
    always evaluated along the last axis of `ranks`.
    '''
    # Follows mannwhitneyu [2]
    n = n1 + n2
    mean_u = n1 * n2 / 2

    # Tie correction according to [2]
    tie_term = np.apply_along_axis(_tie_term, -1, ranks)
    std_u = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))

    # equivalent to using scipy.stats.tiecorrect
    # T = np.apply_along_axis(stats.tiecorrect, -1, ranks)
    # s = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)

    deviation = U - mean_u

    # Continuity correction.
    # Because SF is always used to calculate the p-value, we can always
    # _subtract_ 0.5 for the continuity correction. This always increases the
    # p-value to account for the rest of the probability mass _at_ q = U.
    if continuity:
        deviation -= 0.5

    # no problem evaluating the norm SF at an infinity
    with np.errstate(divide='ignore', invalid='ignore'):
        z = deviation / std_u
    return z
|
||||
|
||||
|
||||
def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
|
||||
''' Input validation and standardization for mannwhitneyu '''
|
||||
# Would use np.asarray_chkfinite, but infs are OK
|
||||
x, y = np.atleast_1d(x), np.atleast_1d(y)
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
bools = {True, False}
|
||||
if use_continuity not in bools:
|
||||
raise ValueError(f'`use_continuity` must be one of {bools}.')
|
||||
|
||||
alternatives = {"two-sided", "less", "greater"}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
axis_int = int(axis)
|
||||
if axis != axis_int:
|
||||
raise ValueError('`axis` must be an integer.')
|
||||
|
||||
methods = {"asymptotic", "exact", "auto"}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
raise ValueError(f'`method` must be one of {methods}.')
|
||||
|
||||
return x, y, use_continuity, alternative, axis_int, method
|
||||
|
||||
|
||||
def _tie_check(xy):
|
||||
"""Find any ties in data"""
|
||||
_, t = np.unique(xy, return_counts=True, axis=-1)
|
||||
return np.any(t != 1)
|
||||
|
||||
|
||||
def _mwu_choose_method(n1, n2, xy, method):
|
||||
"""Choose method 'asymptotic' or 'exact' depending on input size, ties"""
|
||||
|
||||
# if both inputs are large, asymptotic is OK
|
||||
if n1 > 8 and n2 > 8:
|
||||
return "asymptotic"
|
||||
|
||||
# if there are any ties, asymptotic is preferred
|
||||
if np.apply_along_axis(_tie_check, -1, xy).any():
|
||||
return "asymptotic"
|
||||
|
||||
return "exact"
|
||||
|
||||
|
||||
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))


@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
                 axis=0, method="auto"):
    r'''Perform the Mann-Whitney U rank test on two independent samples.

    The Mann-Whitney U test is a nonparametric test of the null hypothesis
    that the distribution underlying sample `x` is the same as the
    distribution underlying sample `y`. It is often used as a test of
    difference in location between distributions.

    Parameters
    ----------
    x, y : array-like
        N-d arrays of samples. The arrays must be broadcastable except along
        the dimension given by `axis`.
    use_continuity : bool, optional
            Whether a continuity correction (1/2) should be applied.
            Default is True when `method` is ``'asymptotic'``; has no effect
            otherwise.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
        distributions underlying `x` and `y`, respectively. Then the following
        alternative hypotheses are available:

        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
          at least one *u*.
        * 'less': the distribution underlying `x` is stochastically less
          than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
        * 'greater': the distribution underlying `x` is stochastically greater
          than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.

        Under a more restrictive set of assumptions, the alternative hypotheses
        can be expressed in terms of the locations of the distributions;
        see [5] section 5.1.
    axis : int, optional
        Axis along which to perform the test. Default is 0.
    method : {'auto', 'asymptotic', 'exact'}, optional
        Selects the method used to calculate the *p*-value.
        Default is 'auto'. The following options are available.

        * ``'asymptotic'``: compares the standardized test statistic
          against the normal distribution, correcting for ties.
        * ``'exact'``: computes the exact *p*-value by comparing the observed
          :math:`U` statistic against the exact distribution of the :math:`U`
          statistic under the null hypothesis. No correction is made for ties.
        * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
          is less than or equal to 8 and there are no ties;
          chooses ``'asymptotic'`` otherwise.

    Returns
    -------
    res : MannwhitneyuResult
        An object containing attributes:

        statistic : float
            The Mann-Whitney U statistic corresponding with sample `x`. See
            Notes for the test statistic corresponding with sample `y`.
        pvalue : float
            The associated *p*-value for the chosen `alternative`.

    Notes
    -----
    If ``U1`` is the statistic corresponding with sample `x`, then the
    statistic corresponding with sample `y` is
    ``U2 = x.shape[axis] * y.shape[axis] - U1``.

    `mannwhitneyu` is for independent samples. For related / paired samples,
    consider `scipy.stats.wilcoxon`.

    `method` ``'exact'`` is recommended when there are no ties and when either
    sample size is less than 8 [1]_. The implementation follows the recurrence
    relation originally proposed in [1]_ as it is described in [3]_.
    Note that the exact method is *not* corrected for ties, but
    `mannwhitneyu` will not raise errors or warnings if there are ties in the
    data.

    The Mann-Whitney U test is a non-parametric version of the t-test for
    independent samples. When the means of samples from the populations
    are normally distributed, consider `scipy.stats.ttest_ind`.

    See Also
    --------
    scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind

    References
    ----------
    .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
           variables is stochastically larger than the other", The Annals of
           Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
    .. [2] Mann-Whitney U Test, Wikipedia,
           http://en.wikipedia.org/wiki/Mann-Whitney_U_test
    .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
           Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
           Inference, Vol. 79, pp. 349-364, 1999.
    .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
           Learning Support Centre, 2004.
    .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
           or t-test? On assumptions for hypothesis tests and multiple \
           interpretations of decision rules." Statistics surveys, Vol. 4, pp.
           1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/

    Examples
    --------
    We follow the example from [4]_: nine randomly sampled young adults were
    diagnosed with type II diabetes at the ages below.

    >>> males = [19, 22, 16, 29, 24]
    >>> females = [20, 11, 17, 12]

    We use the Mann-Whitney U test to assess whether there is a statistically
    significant difference in the diagnosis age of males and females.
    The null hypothesis is that the distribution of male diagnosis ages is
    the same as the distribution of female diagnosis ages. We decide
    that a confidence level of 95% is required to reject the null hypothesis
    in favor of the alternative that the distributions are different.
    Since the number of samples is very small and there are no ties in the
    data, we can compare the observed test statistic against the *exact*
    distribution of the test statistic under the null hypothesis.

    >>> from scipy.stats import mannwhitneyu
    >>> U1, p = mannwhitneyu(males, females, method="exact")
    >>> print(U1)
    17.0

    `mannwhitneyu` always reports the statistic associated with the first
    sample, which, in this case, is males. This agrees with :math:`U_M = 17`
    reported in [4]_. The statistic associated with the second sample
    can be calculated:

    >>> nx, ny = len(males), len(females)
    >>> U2 = nx*ny - U1
    >>> print(U2)
    3.0

    This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
    *p*-value can be calculated from either statistic, and the value produced
    by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.

    >>> print(p)
    0.1111111111111111

    The exact distribution of the test statistic is asymptotically normal, so
    the example continues by comparing the exact *p*-value against the
    *p*-value produced using the normal approximation.

    >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
    >>> print(pnorm)
    0.11134688653314041

    Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
    value :math:`p = 0.09` given in [4]_. The reason is that [4]_
    does not apply the continuity correction performed by `mannwhitneyu`;
    `mannwhitneyu` reduces the distance between the test statistic and the
    mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
    discrete statistic is being compared against a continuous distribution.
    Here, the :math:`U` statistic used is less than the mean, so we reduce
    the distance by adding 0.5 in the numerator.

    >>> import numpy as np
    >>> from scipy.stats import norm
    >>> U = min(U1, U2)
    >>> N = nx + ny
    >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
    >>> p = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
    >>> print(p)
    0.11134688653314041

    If desired, we can disable the continuity correction to get a result
    that agrees with that reported in [4]_.

    >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
    ...                         method="asymptotic")
    >>> print(pnorm)
    0.0864107329737

    Regardless of whether we perform an exact or asymptotic test, the
    probability of the test statistic being as extreme or more extreme by
    chance exceeds 5%, so we do not consider the results statistically
    significant.

    Suppose that, before seeing the data, we had hypothesized that females
    would tend to be diagnosed at a younger age than males.
    In that case, it would be natural to provide the female ages as the
    first input, and we would have performed a one-sided test using
    ``alternative = 'less'``: females are diagnosed at an age that is
    stochastically less than that of males.

    >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
    >>> print(res)
    MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)

    Again, the probability of getting a sufficiently low value of the
    test statistic by chance under the null hypothesis is greater than 5%,
    so we do not reject the null hypothesis in favor of our alternative.

    If it is reasonable to assume that the means of samples from the
    populations are normally distributed, we could have used a t-test to
    perform the analysis.

    >>> from scipy.stats import ttest_ind
    >>> res = ttest_ind(females, males, alternative="less")
    >>> print(res)
    Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)

    Under this assumption, the *p*-value would be low enough to reject the
    null hypothesis in favor of the alternative.

    '''

    x, y, use_continuity, alternative, axis_int, method = (
        _mwu_input_validation(x, y, use_continuity, alternative, axis, method))

    # Use the validated integer axis rather than the raw argument, so that an
    # integral value of another type (e.g. 1.0) is handled consistently.
    x, y, xy = _broadcast_concatenate(x, y, axis_int)

    n1, n2 = x.shape[-1], y.shape[-1]

    if method == "auto":
        method = _mwu_choose_method(n1, n2, xy, method)

    # Follows [2]
    ranks = stats.rankdata(xy, axis=-1)  # method 2, step 1
    R1 = ranks[..., :n1].sum(axis=-1)    # method 2, step 2
    U1 = R1 - n1*(n1+1)/2                # method 2, step 3
    U2 = n1 * n2 - U1                    # as U1 + U2 = n1 * n2

    if alternative == "greater":
        U, f = U1, 1  # U is the statistic to use for p-value, f is a factor
    elif alternative == "less":
        U, f = U2, 1  # Due to symmetry, use SF of U2 rather than CDF of U1
    else:
        U, f = np.maximum(U1, U2), 2  # multiply SF by two for two-sided test

    if method == "exact":
        p = _mwu_state.sf(U.astype(int), n1, n2)
    elif method == "asymptotic":
        z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)
        p = stats.norm.sf(z)
    p *= f

    # Ensure that the p-value is not greater than 1
    # This could happen for exact test when U = m*n/2
    p = np.clip(p, 0, 1)

    return MannwhitneyuResult(U1, p)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,474 @@
|
||||
"""
|
||||
Additional statistics functions with support for masked arrays.
|
||||
|
||||
"""
|
||||
|
||||
# Original author (2007): Pierre GF Gerard-Marchant
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
|
||||
import numpy as np
|
||||
from numpy import float_, int_, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import _mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    Raises
    ------
    ValueError
        If `axis` is not None and `data` has more than two dimensions.

    See Also
    --------
    hdquantiles_sd

    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(np.ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        # Row 0 holds the quantile estimates, row 1 their variances.
        hd = np.empty((2, len(prob)), np.float64)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            # Weights are increments of the Beta CDF over the grid k/n.
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = np.dot(w, (xsorted-hd_mean)**2)
        # The Beta parameters degenerate at p == 0 and p == 1, so those
        # quantiles are set to the sample extremes directly.
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=np.float64)
    # `np.asarray` copies only when required; `np.array(..., copy=False)`
    # raises for list input on NumPy >= 2.
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)
|
||||
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
    """
    Harrell-Davis estimate of the median along the given axis.

    This is `hdquantiles` evaluated at the single probability 0.5, with
    length-one dimensions removed from the result.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which the estimate is computed. If None, a flattened
        array is used.
    var : bool, optional
        Whether the variance of the estimate is returned as well.

    Returns
    -------
    hdmedian : MaskedArray
        The median estimate(s). With ``var=True`` the variance is included
        in the same masked array, e.g. for 1-D input the result has shape
        (2,) instead of being a single value.

    """
    return hdquantiles(data, [0.5], axis=axis, var=var).squeeze()
|
||||
|
||||
|
||||
def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates. Entries are
        masked when fewer than two unmasked observations are available.

    Raises
    ------
    ValueError
        If `axis` is not None and `data` has more than two dimensions.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), np.float64)
        if n < 2:
            # The jackknife needs at least two observations. Return the NaN
            # placeholders now; falling through would divide by ``n - 1 == 0``
            # and overwrite them with a spurious 0.0.
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            # Leave-one-out estimates: drop observation k and apply the
            # (n-1) weights to the remaining order statistics.
            mx_ = np.fromiter([w[:k] @ xsorted[:k] + w[k:] @ xsorted[k+1:]
                               for k in range(n)], dtype=np.float64)
            hdsd[i] = np.sqrt(mx_.var() * (n - 1))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=np.float64)
    # `np.asarray` copies only when required; `np.array(..., copy=False)`
    # raises for list input on NumPy >= 2.
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Confidence interval of the trimmed mean along the given axis.

    The data are trimmed with ``mstats.trimr``; the interval is the trimmed
    mean plus/minus a Student-t quantile times ``mstats.trimmed_stde``.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then
        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
        largest data are masked. The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        Forwarded to the trimming routines: indicates whether the number of
        data being masked on each side should be rounded (True) or
        truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        The two-sided interval uses the ``1 - alpha/2`` quantile of the
        t distribution.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence limits of the trimmed mean.

    """
    masked = ma.array(data, copy=False)
    trim_options = dict(limits=limits, inclusive=inclusive, axis=axis)

    kept = mstats.trimr(masked, **trim_options)
    center = kept.mean(axis)
    std_error = mstats.trimmed_stde(masked, **trim_options)

    # Degrees of freedom are based on the observations left after trimming.
    dof = kept.count(axis) - 1
    half_width = t.ppf(1-alpha/2., dof) * std_error
    return np.array((center - half_width, center + half_width))
|
||||
|
||||
|
||||
def mjci(data, prob=(0.25, 0.5, 0.75), axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    mjci : ndarray
        One standard-error estimate per entry of `prob` (computed
        independently along `axis` when it is not None).

    Raises
    ------
    ValueError
        If `data` has more than two dimensions.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        # Order-statistic index associated with each probability.
        prob = (np.array(p) * n + 0.5).astype(np.int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), np.float64)
        x = np.arange(1, n+1, dtype=np.float64) / n
        y = x - 1./n
        for (i, m) in enumerate(prob):
            # Beta-CDF increments over [k/n - 1/n, k/n] weight the sorted
            # data; the estimate is the weighted standard deviation.
            W = betacdf(x, m-1, n-m) - betacdf(y, m-1, n-m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    # `np.asarray` copies only when required; `np.array(..., copy=False)`
    # raises for list input on NumPy >= 2.
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Confidence interval for selected quantiles of the data, based on the
    Maritz-Jarrett standard-error estimators.

    The interval is the sample quantile (``mstats.mquantiles`` with
    ``alphap=0, betap=0``) plus/minus a normal quantile times the
    Maritz-Jarrett standard error returned by `mjci`.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Level used for the interval; ``alpha`` and ``1 - alpha`` give the
        same result because the smaller of the two is used.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length as
        `prob`.

    """
    tail = min(alpha, 1 - alpha)
    critical = norm.ppf(1 - tail/2.)
    quantiles = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    half_width = critical * mjci(data, prob, axis=axis)
    return (quantiles - half_width, quantiles + half_width)
|
||||
|
||||
|
||||
def median_cihs(data, alpha=0.05, axis=None):
    """
    Confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method: two adjacent order-statistic
    intervals are interpolated so that the binomial coverage matches the
    requested level.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Level of the interval; ``alpha`` and ``1 - alpha`` are treated
        identically because the smaller of the two is used.
    axis : int or None, optional
        Axis along which to compute the interval. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Lower and upper limits of the interval.

    Raises
    ------
    ValueError
        If `axis` is not None and `data` has more than two dimensions.

    """
    def _cihs_1D(data, alpha):
        values = np.sort(data.compressed())
        size = len(values)
        alpha = min(alpha, 1-alpha)

        def coverage(j):
            # Binomial(size, 0.5) probability of the counts j .. size-j.
            return binom.cdf(size-j, size, 0.5) - binom.cdf(j-1, size, 0.5)

        k = int(binom._ppf(alpha/2., size, 0.5))
        gk = coverage(k)
        if gk < 1-alpha:
            # Widen by one order statistic when coverage falls short.
            k -= 1
            gk = coverage(k)
        gkk = coverage(k+1)

        # Interpolation weight between the wider and the narrower interval.
        interp = (gk - 1 + alpha)/(gk - gkk)
        lambd = (size-k) * interp / float(k + (size-2*k)*interp)
        lower = lambd*values[k] + (1-lambd)*values[k-1]
        upper = lambd*values[size-k-1] + (1-lambd)*values[size-k]
        return (lower, upper)

    data = ma.array(data, copy=False)
    if axis is not None:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        return ma.apply_along_axis(_cihs_1D, axis, data, alpha)
    return _cihs_1D(data, alpha)
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
|
||||
"""
|
||||
Compares the medians from two independent groups along the given axis.
|
||||
|
||||
The comparison is performed using the McKean-Schrader estimate of the
|
||||
standard error of the medians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
group_1 : array_like
|
||||
First dataset. Has to be of size >=7.
|
||||
group_2 : array_like
|
||||
Second dataset. Has to be of size >=7.
|
||||
axis : int, optional
|
||||
Axis along which the medians are estimated. If None, the arrays are
|
||||
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||
should have the same shape.
|
||||
|
||||
Returns
|
||||
-------
|
||||
compare_medians_ms : {float, ndarray}
|
||||
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||
ndarray of floats with a length equal to the length of `group_1`
|
||||
along `axis`.
|
||||
|
||||
"""
|
||||
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||
mstats.stde_median(group_2, axis=axis))
|
||||
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||
return 1 - norm.cdf(W)
|
||||
|
||||
|
||||
def idealfourths(data, axis=None):
    """
    Estimate the lower and upper quartiles with the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        The two internal values that divide `data` into four parts, computed
        either on the flattened array (if `axis` is None) or along `axis`.
        Both values are NaN when fewer than three unmasked observations are
        available.

    """
    def _idf(data):
        valid = data.compressed()
        count = len(valid)
        if count < 3:
            return [np.nan,np.nan]
        # Integer part selects the order statistic, fractional part is the
        # interpolation weight towards its neighbour.
        whole, frac = divmod(count/4. + 5/12., 1)
        lo_idx = int(whole)
        hi_idx = count - lo_idx
        lower = (1-frac)*valid[lo_idx-1] + frac*valid[lo_idx]
        upper = (1-frac)*valid[hi_idx] + frac*valid[hi_idx-1]
        return [lower, upper]

    ordered = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is not None:
        return ma.apply_along_axis(_idf, axis, ordered)
    return _idf(ordered)
|
||||
|
||||
|
||||
def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to the
    derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

    Returns
    -------
    rsh
        The estimator evaluated at each of `points`: the number of data
        falling within a window of half-width ``h`` around the point,
        divided by ``2*n*h``, where ``n`` is the number of unmasked data
        and ``h`` is derived from the ideal-fourths quartile range.

    Raises
    ------
    AttributeError
        If `data` is not one-dimensional.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        # `np.asarray` copies only when required; `np.array(..., copy=False)`
        # raises for list input on NumPy >= 2.
        points = np.atleast_1d(np.asarray(points))

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")

    n = data.count()
    r = idealfourths(data, axis=None)
    # Window half-width from the interquartile range of the data.
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    # Counts of data at or below the upper edge and strictly below the
    # lower edge of each window; their difference is the window count.
    nhi = (data[:,None] <= points[None,:] + h).sum(0)
    nlo = (data[:,None] < points[None,:] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,476 @@
|
||||
from itertools import permutations
|
||||
import numpy as np
|
||||
import math
|
||||
from ._continuous_distns import norm
|
||||
import scipy.stats
|
||||
from dataclasses import make_dataclass
|
||||
|
||||
|
||||
PageTrendTestResult = make_dataclass("PageTrendTestResult",
|
||||
("statistic", "pvalue", "method"))
|
||||
|
||||
|
||||
def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
    r"""
    Perform Page's Test, a measure of trend in observations between treatments.

    Page's Test (also known as Page's :math:`L` test) is useful when:

    * there are :math:`n \geq 3` treatments,
    * :math:`m \geq 2` subjects are observed for each treatment, and
    * the observations are hypothesized to have a particular order.

    Specifically, the test considers the null hypothesis that

    .. math::

        m_1 = m_2 = m_3 \cdots = m_n,

    where :math:`m_j` is the mean of the observed quantity under treatment
    :math:`j`, against the alternative hypothesis that

    .. math::

        m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,

    where at least one inequality is strict.

    As noted by [4]_, Page's :math:`L` test has greater statistical power than
    the Friedman test against the alternative that there is a difference in
    trend, as Friedman's test only considers a difference in the means of the
    observations without considering their order. Whereas Spearman :math:`\rho`
    considers the correlation between the ranked observations of two variables
    (e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
    carries), Page's :math:`L` is concerned with a trend in an observation
    (e.g. the airspeed velocity of a swallow) across several distinct
    treatments (e.g. carrying each of five coconuts of different weight) even
    as the observation is repeated with multiple subjects (e.g. one European
    swallow and one African swallow).

    Parameters
    ----------
    data : array-like
        A :math:`m \times n` array; the element in row :math:`i` and
        column :math:`j` is the observation corresponding with subject
        :math:`i` and treatment :math:`j`. By default, the columns are
        assumed to be arranged in order of increasing predicted mean.

    ranked : boolean, optional
        By default, `data` is assumed to be observations rather than ranks;
        it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
        `data` is provided in the form of ranks, pass argument ``True``.

    predicted_ranks : array-like, optional
        The predicted ranks of the column means. If not specified,
        the columns are assumed to be arranged in order of increasing
        predicted mean, so the default `predicted_ranks` are
        :math:`[1, 2, \dots, n-1, n]`.

    method : {'auto', 'asymptotic', 'exact'}, optional
        Selects the method used to calculate the *p*-value. The following
        options are available.

        * 'auto': selects between 'exact' and 'asymptotic' to
          achieve reasonably accurate results in reasonable time (default)
        * 'asymptotic': compares the standardized test statistic against
          the normal distribution
        * 'exact': computes the exact *p*-value by comparing the observed
          :math:`L` statistic against those realized by all possible
          permutations of ranks (under the null hypothesis that each
          permutation is equally likely)

    Returns
    -------
    res : PageTrendTestResult
        An object containing attributes:

        statistic : float
            Page's :math:`L` test statistic.
        pvalue : float
            The associated *p*-value
        method : {'asymptotic', 'exact'}
            The method used to compute the *p*-value

    Raises
    ------
    ValueError
        If `method` is not one of the supported options, if `data` is not
        two-dimensional with at least two rows and three columns, if `data`
        contains NaNs, if ``ranked=True`` but the values lie outside
        ``[1, n]``, or if `predicted_ranks` is not a permutation of
        ``1, ..., n``.
    TypeError
        If `ranked` is not a ``bool``.

    See Also
    --------
    rankdata, friedmanchisquare, spearmanr

    Notes
    -----
    As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
    :math:`n` objects or events or performances or persons or trials ranked."
    Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
    "groupings by ability or some other control variable, or judges doing
    the ranking, or random replications of some other sort."

    The procedure for calculating the :math:`L` statistic, adapted from
    [1]_, is:

    1. "Predetermine with careful logic the appropriate hypotheses
       concerning the predicted ordering of the experimental results.
       If no reasonable basis for ordering any treatments is known, the
       :math:`L` test is not appropriate."
    2. "As in other experiments, determine at what level of confidence
       you will reject the null hypothesis that there is no agreement of
       experimental results with the monotonic hypothesis."
    3. "Cast the experimental material into a two-way table of :math:`n`
       columns (treatments, objects ranked, conditions) and :math:`m`
       rows (subjects, replication groups, levels of control variables)."
    4. "When experimental observations are recorded, rank them across each
       row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
    5. "Add the ranks in each column", e.g.
       ``colsums = np.sum(ranks, axis=0)``.
    6. "Multiply each sum of ranks by the predicted rank for that same
       column", e.g. ``products = predicted_ranks * colsums``.
    7. "Sum all such products", e.g. ``L = products.sum()``.

    [1]_ continues by suggesting use of the standardized statistic

    .. math::

        \chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}

    "which is distributed approximately as chi-square with 1 degree of
    freedom. The ordinary use of :math:`\chi^2` tables would be
    equivalent to a two-sided test of agreement. If a one-sided test
    is desired, *as will almost always be the case*, the probability
    discovered in the chi-square table should be *halved*."

    However, this standardized statistic does not distinguish between the
    observed values being well correlated with the predicted ranks and being
    _anti_-correlated with the predicted ranks. Instead, we follow [2]_
    and calculate the standardized statistic

    .. math::

        \Lambda = \frac{L - E_0}{\sqrt{V_0}},

    where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
    :math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
    normal under the null hypothesis".

    The *p*-value for ``method='exact'`` is generated by comparing the observed
    value of :math:`L` against the :math:`L` values generated for all
    :math:`(n!)^m` possible permutations of ranks. The calculation is performed
    using the recursive method of [5]_.

    The *p*-values are not adjusted for the possibility of ties. When
    ties are present, the reported  ``'exact'`` *p*-values may be somewhat
    larger (i.e. more conservative) than the true *p*-value [2]_. The
    ``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
    conservative) than the ``'exact'`` *p*-values.

    References
    ----------
    .. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
       a significant test for linear ranks", *Journal of the American
       Statistical Association* 58(301), p. 216--230, 1963.

    .. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
       approach*, CRC Press, p. 150--152, 2012.

    .. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
       Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
       Accessed July 12, 2020.

    .. [4] "Page's Trend Test", *Wikipedia*, WikimediaFoundation,
       https://en.wikipedia.org/wiki/Page%27s_trend_test,
       Accessed July 12, 2020.

    .. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
       the two-way layout", *Communications in Statistics - Simulation and
       Computation*,  6(1), p. 49--61, 1977.

    Examples
    --------
    We use the example from [3]_: 10 students are asked to rate three
    teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
    with 1 being the lowest and 5 being the highest. We have decided that
    a confidence level of 99% is required to reject the null hypothesis in
    favor of our alternative: that the seminar will have the highest ratings
    and the tutorial will have the lowest. Initially, the data have been
    tabulated with each row representing an individual student's ratings of
    the three methods in the following order: tutorial, lecture, seminar.

    >>> table = [[3, 4, 3],
    ...          [2, 2, 4],
    ...          [3, 3, 5],
    ...          [1, 3, 2],
    ...          [2, 3, 2],
    ...          [2, 4, 5],
    ...          [1, 2, 4],
    ...          [3, 4, 4],
    ...          [2, 4, 5],
    ...          [1, 3, 4]]

    Because the tutorial is hypothesized to have the lowest ratings, the
    column corresponding with tutorial rankings should be first; the seminar
    is hypothesized to have the highest ratings, so its column should be last.
    Since the columns are already arranged in this order of increasing
    predicted mean, we can pass the table directly into `page_trend_test`.

    >>> from scipy.stats import page_trend_test
    >>> res = page_trend_test(table)
    >>> res
    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
                        method='exact')

    This *p*-value indicates that there is a 0.1819% chance that
    the :math:`L` statistic would reach such an extreme value under the null
    hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
    the null hypothesis in favor of our alternative at a 99% confidence level.

    The value of the :math:`L` statistic is 133.5. To check this manually,
    we rank the data such that high scores correspond with high ranks, settling
    ties with an average rank:

    >>> from scipy.stats import rankdata
    >>> ranks = rankdata(table, axis=1)
    >>> ranks
    array([[1.5, 3. , 1.5],
           [1.5, 1.5, 3. ],
           [1.5, 1.5, 3. ],
           [1. , 3. , 2. ],
           [1.5, 3. , 1.5],
           [1. , 2. , 3. ],
           [1. , 2. , 3. ],
           [1. , 2.5, 2.5],
           [1. , 2. , 3. ],
           [1. , 2. , 3. ]])

    We add the ranks within each column, multiply the sums by the
    predicted ranks, and sum the products.

    >>> import numpy as np
    >>> m, n = ranks.shape
    >>> predicted_ranks = np.arange(1, n+1)
    >>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
    >>> res.statistic == L
    True

    As presented in [3]_, the asymptotic approximation of the *p*-value is the
    survival function of the normal distribution evaluated at the standardized
    test statistic:

    >>> from scipy.stats import norm
    >>> E0 = (m*n*(n+1)**2)/4
    >>> V0 = (m*n**2*(n+1)*(n**2-1))/144
    >>> Lambda = (L-E0)/np.sqrt(V0)
    >>> p = norm.sf(Lambda)
    >>> p
    0.0012693433690751756

    This does not precisely match the *p*-value reported by `page_trend_test`
    above. The asymptotic distribution is not very accurate, nor conservative,
    for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
    use ``method='exact'`` based on the dimensions of the table and the
    recommendations in Page's original paper [1]_. To override
    `page_trend_test`'s choice, provide the `method` argument.

    >>> res = page_trend_test(table, method="asymptotic")
    >>> res
    PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
                        method='asymptotic')

    If the data are already ranked, we can pass in the ``ranks`` instead of
    the ``table`` to save computation time.

    >>> res = page_trend_test(ranks,             # ranks of data
    ...                       ranked=True,       # data is already ranked
    ...                       )
    >>> res
    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
                        method='exact')

    Suppose the raw data had been tabulated in an order different from the
    order of predicted means, say lecture, seminar, tutorial.

    >>> table = np.asarray(table)[:, [1, 2, 0]]

    Since the arrangement of this table is not consistent with the assumed
    ordering, we can either rearrange the table or provide the
    `predicted_ranks`. Remembering that the lecture is predicted
    to have the middle rank, the seminar the highest, and tutorial the lowest,
    we pass:

    >>> res = page_trend_test(table,             # data as originally tabulated
    ...                       predicted_ranks=[2, 3, 1],  # our predicted order
    ...                       )
    >>> res
    PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
                        method='exact')

    """

    # Possible values of the method parameter and the corresponding function
    # used to evaluate the p value
    methods = {"asymptotic": _l_p_asymptotic,
               "exact": _l_p_exact,
               "auto": None}
    if method not in methods:
        raise ValueError(f"`method` must be in {set(methods)}")

    # `np.asarray` converts without an unnecessary copy; the former
    # `np.array(data, copy=False)` raises for list input on NumPy >= 2.
    ranks = np.asarray(data)
    if ranks.ndim != 2:  # TODO: relax this to accept 3d arrays?
        raise ValueError("`data` must be a 2d array.")

    m, n = ranks.shape
    if m < 2 or n < 3:
        raise ValueError("Page's L is only appropriate for data with two "
                         "or more rows and three or more columns.")

    if np.any(np.isnan(data)):
        raise ValueError("`data` contains NaNs, which cannot be ranked "
                         "meaningfully")

    # ensure NumPy array and rank the data if it's not already ranked
    if ranked:
        # Only a basic check on whether data is ranked. Checking that the data
        # is properly ranked could take as much time as ranking it.
        if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
            raise ValueError("`data` is not properly ranked. Rank the data or "
                             "pass `ranked=False`.")
    else:
        ranks = scipy.stats.rankdata(data, axis=-1)

    # generate predicted ranks if not provided, ensure valid NumPy array
    if predicted_ranks is None:
        predicted_ranks = np.arange(1, n+1)
    else:
        predicted_ranks = np.asarray(predicted_ranks)
        if (predicted_ranks.ndim < 1 or
                (set(predicted_ranks) != set(range(1, n+1)) or
                 len(predicted_ranks) != n)):
            raise ValueError(f"`predicted_ranks` must include each integer "
                             f"from 1 to {n} (the number of columns in "
                             f"`data`) exactly once.")

    if type(ranked) is not bool:
        raise TypeError("`ranked` must be boolean.")

    # Calculate the L statistic
    L = _l_vectorized(ranks, predicted_ranks)

    # Calculate the p-value
    if method == "auto":
        method = _choose_method(ranks)
    p_fun = methods[method]  # get the function corresponding with the method
    p = p_fun(L, m, n)

    page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
    return page_result
|
||||
|
||||
|
||||
def _choose_method(ranks):
|
||||
'''Choose method for computing p-value automatically'''
|
||||
m, n = ranks.shape
|
||||
if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
|
||||
method = "asymptotic"
|
||||
else:
|
||||
method = "exact"
|
||||
return method
|
||||
|
||||
|
||||
def _l_vectorized(ranks, predicted_ranks):
|
||||
'''Calculate's Page's L statistic for each page of a 3d array'''
|
||||
colsums = ranks.sum(axis=-2, keepdims=True)
|
||||
products = predicted_ranks * colsums
|
||||
Ls = products.sum(axis=-1)
|
||||
Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
|
||||
return Ls
|
||||
|
||||
|
||||
def _l_p_asymptotic(L, m, n):
|
||||
'''Calculate the p-value of Page's L from the asymptotic distribution'''
|
||||
# Using [1] as a reference, the asymptotic p-value would be calculated as:
|
||||
# chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
|
||||
# p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
|
||||
# but this is insentive to the direction of the hypothesized ranking
|
||||
|
||||
# See [2] page 151
|
||||
E0 = (m*n*(n+1)**2)/4
|
||||
V0 = (m*n**2*(n+1)*(n**2-1))/144
|
||||
Lambda = (L-E0)/np.sqrt(V0)
|
||||
# This is a one-sided "greater" test - calculate the probability that the
|
||||
# L statistic under H0 would be greater than the observed L statistic
|
||||
p = norm.sf(Lambda)
|
||||
return p
|
||||
|
||||
|
||||
def _l_p_exact(L, m, n):
    '''Calculate the p-value of Page's L exactly'''
    # [1] uses m, n; [5] uses n, k. The exact calculation code references
    # [5], so `m` plays the role of n and `n` the role of k from here on.
    statistic = int(L)
    n_rows = int(m)
    n_cols = int(n)
    _pagel_state.set_k(n_cols)
    return _pagel_state.sf(statistic, n_rows)
|
||||
|
||||
|
||||
class _PageL:
|
||||
'''Maintains state between `page_trend_test` executions'''
|
||||
|
||||
def __init__(self):
|
||||
'''Lightweight initialization'''
|
||||
self.all_pmfs = {}
|
||||
|
||||
def set_k(self, k):
|
||||
'''Calculate lower and upper limits of L for single row'''
|
||||
self.k = k
|
||||
# See [5] top of page 52
|
||||
self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
|
||||
|
||||
def sf(self, l, n):
|
||||
'''Survival function of Page's L statistic'''
|
||||
ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
|
||||
return np.sum(ps)
|
||||
|
||||
def p_l_k_1(self):
|
||||
'''Relative frequency of each L value over all possible single rows'''
|
||||
|
||||
# See [5] Equation (6)
|
||||
ranks = range(1, self.k+1)
|
||||
# generate all possible rows of length k
|
||||
rank_perms = np.array(list(permutations(ranks)))
|
||||
# compute Page's L for all possible rows
|
||||
Ls = (ranks*rank_perms).sum(axis=1)
|
||||
# count occurences of each L value
|
||||
counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
|
||||
# factorial(k) is number of possible permutations
|
||||
return counts/math.factorial(self.k)
|
||||
|
||||
def pmf(self, l, n):
|
||||
'''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
|
||||
|
||||
if n not in self.all_pmfs:
|
||||
self.all_pmfs[n] = {}
|
||||
if self.k not in self.all_pmfs[n]:
|
||||
self.all_pmfs[n][self.k] = {}
|
||||
|
||||
# Cache results to avoid repeating calculation. Initially this was
|
||||
# written with lru_cache, but this seems faster? Also, we could add
|
||||
# an option to save this for future lookup.
|
||||
if l in self.all_pmfs[n][self.k]:
|
||||
return self.all_pmfs[n][self.k][l]
|
||||
|
||||
if n == 1:
|
||||
ps = self.p_l_k_1() # [5] Equation 6
|
||||
ls = range(self.a, self.b+1)
|
||||
# not fast, but we'll only be here once
|
||||
self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
|
||||
return self.all_pmfs[n][self.k][l]
|
||||
|
||||
p = 0
|
||||
low = max(l-(n-1)*self.b, self.a) # [5] Equation 2
|
||||
high = min(l-(n-1)*self.a, self.b)
|
||||
|
||||
# [5] Equation 1
|
||||
for t in range(low, high+1):
|
||||
p1 = self.pmf(l-t, n-1)
|
||||
p2 = self.pmf(t, 1)
|
||||
p += p1*p2
|
||||
self.all_pmfs[n][self.k][l] = p
|
||||
return p
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to page_trend_test with method='exact'
|
||||
_pagel_state = _PageL()
|
||||
1740
dashboard/flask-server/venv/Lib/site-packages/scipy/stats/_qmc.py
Normal file
1740
dashboard/flask-server/venv/Lib/site-packages/scipy/stats/_qmc.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
from scipy._lib._util import DecimalNumber, IntNumber
|
||||
|
||||
|
||||
# NOTE(review): every definition below is a signature with an `...` body and
# no implementation; presumably these are typing stubs for compiled (Cython)
# routines -- confirm against the extension module.


# Takes a sample array plus `iterative` and `workers` options; declared to
# return a float.
def _cy_wrapper_centered_discrepancy(
        sample: np.ndarray,
        iterative: bool,
        workers: IntNumber,
) -> float: ...


# Same signature as `_cy_wrapper_centered_discrepancy`.
def _cy_wrapper_wrap_around_discrepancy(
        sample: np.ndarray,
        iterative: bool,
        workers: IntNumber,
) -> float: ...


# Same signature as `_cy_wrapper_centered_discrepancy`.
def _cy_wrapper_mixture_discrepancy(
        sample: np.ndarray,
        iterative: bool,
        workers: IntNumber,
) -> float: ...


# Same signature as `_cy_wrapper_centered_discrepancy`.
def _cy_wrapper_l2_star_discrepancy(
        sample: np.ndarray,
        iterative: bool,
        workers: IntNumber,
) -> float: ...


# Takes a new point, an existing sample and an initial discrepancy value;
# declared to return a float.
def _cy_wrapper_update_discrepancy(
        x_new_view: np.ndarray,
        sample_view: np.ndarray,
        initial_disc: DecimalNumber,
) -> float: ...


# Declared to return an ndarray; presumably the van der Corput sequence of
# `n` points in base `base` starting at `start_index` -- confirm.
def _cy_van_der_corput(
        n: IntNumber,
        base: IntNumber,
        start_index: IntNumber,
        workers: IntNumber,
) -> np.ndarray: ...


# As `_cy_van_der_corput`, with an additional `permutations` array argument.
def _cy_van_der_corput_scrambled(
        n: IntNumber,
        base: IntNumber,
        start_index: IntNumber,
        permutations: np.ndarray,
        workers: IntNumber,
) -> np.ndarray: ...
|
||||
@@ -0,0 +1,260 @@
|
||||
|
||||
import operator
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from scipy.special import ndtri
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
def _validate_int(n, bound, name):
|
||||
msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
|
||||
try:
|
||||
n = operator.index(n)
|
||||
except TypeError:
|
||||
raise TypeError(msg) from None
|
||||
if n < bound:
|
||||
raise ValueError(msg)
|
||||
return n
|
||||
|
||||
|
||||
@dataclass
class RelativeRiskResult:
    """
    Result of `scipy.stats.contingency.relative_risk`.

    Attributes
    ----------
    relative_risk : float
        This is::

            (exposed_cases/exposed_total) / (control_cases/control_total)

    exposed_cases : int
        The number of "cases" (i.e. occurrence of disease or other event
        of interest) among the sample of "exposed" individuals.
    exposed_total : int
        The total number of "exposed" individuals in the sample.
    control_cases : int
        The number of "cases" among the sample of "control" or non-exposed
        individuals.
    control_total : int
        The total number of "control" individuals in the sample.

    Methods
    -------
    confidence_interval :
        Compute the confidence interval for the relative risk estimate.
    """

    relative_risk: float
    exposed_cases: int
    exposed_total: int
    control_cases: int
    control_total: int

    def confidence_interval(self, confidence_level=0.95):
        """
        Compute the confidence interval for the relative risk.

        The confidence interval is computed using the Katz method
        (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).

        Parameters
        ----------
        confidence_level : float, optional
            The confidence level to use for the confidence interval.
            Default is 0.95.

        Returns
        -------
        ci : ConfidenceInterval instance
            The return value is an object with attributes ``low`` and
            ``high`` that hold the confidence interval.

        Raises
        ------
        ValueError
            If `confidence_level` is not in the interval [0, 1].

        References
        ----------
        .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
               confidence intervals for the risk ratio in cohort studies",
               Biometrics, 34, 469-474 (1978).
        .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
               CRC Press LLC, Boca Raton, FL, USA (1996).

        Examples
        --------
        >>> from scipy.stats.contingency import relative_risk
        >>> result = relative_risk(exposed_cases=10, exposed_total=75,
        ...                        control_cases=12, control_total=225)
        >>> result.relative_risk
        2.5
        >>> result.confidence_interval()
        ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
        """
        if not 0 <= confidence_level <= 1:
            raise ValueError('confidence_level must be in the interval '
                             '[0, 1].')

        # Edge cases where exposed_cases and/or control_cases is zero are
        # settled first. The convention follows the R function riskratio
        # from the epitools library.
        no_exposed_cases = self.exposed_cases == 0
        no_control_cases = self.control_cases == 0
        if no_exposed_cases and no_control_cases:
            # relative risk is nan.
            return ConfidenceInterval(low=np.nan, high=np.nan)
        if no_exposed_cases:
            # relative risk is 0.
            return ConfidenceInterval(low=0.0, high=np.nan)
        if no_control_cases:
            # relative risk is inf
            return ConfidenceInterval(low=np.nan, high=np.inf)

        alpha = 1 - confidence_level
        z = ndtri(1 - alpha/2)

        # The estimate of the variance of log(rr) is
        #   var(log(rr)) = 1/exposed_cases - 1/exposed_total +
        #                  1/control_cases - 1/control_total
        # and the standard error is the square root of that.
        log_rr_variance = (1/self.exposed_cases - 1/self.exposed_total +
                           1/self.control_cases - 1/self.control_total)
        half_width = z*np.sqrt(log_rr_variance)
        estimate = self.relative_risk
        return ConfidenceInterval(low=estimate*np.exp(-half_width),
                                  high=estimate*np.exp(half_width))
|
||||
|
||||
|
||||
def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
    """
    Compute the relative risk (also known as the risk ratio).

    This function computes the relative risk associated with a 2x2
    contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
    of accepting a table as an argument, the individual numbers that are
    used to compute the relative risk are given as separate parameters.
    This is to avoid the ambiguity of which row or column of the contingency
    table corresponds to the "exposed" cases and which corresponds to the
    "control" cases.  Unlike, say, the odds ratio, the relative risk is not
    invariant under an interchange of the rows or columns.

    Parameters
    ----------
    exposed_cases : nonnegative int
        The number of "cases" (i.e. occurrence of disease or other event
        of interest) among the sample of "exposed" individuals.
    exposed_total : positive int
        The total number of "exposed" individuals in the sample.
    control_cases : nonnegative int
        The number of "cases" among the sample of "control" or non-exposed
        individuals.
    control_total : positive int
        The total number of "control" individuals in the sample.

    Returns
    -------
    result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
        The object has the float attribute ``relative_risk``, which is::

            rr = (exposed_cases/exposed_total) / (control_cases/control_total)

        The object also has the method ``confidence_interval`` to compute
        the confidence interval of the relative risk for a given confidence
        level.

    Raises
    ------
    TypeError
        If any argument cannot be interpreted as an integer.
    ValueError
        If an argument is below its lower bound, or if a number of cases
        exceeds the corresponding total.

    Notes
    -----
    The R package epitools has the function `riskratio`, which accepts
    a table with the following layout::

                        disease=0   disease=1
        exposed=0 (ref)    n00         n01
        exposed=1          n10         n11

    With a 2x2 table in the above format, the estimate of the CI is
    computed by `riskratio` when the argument method="wald" is given,
    or with the function `riskratio.wald`.

    For example, in a test of the incidence of lung cancer among a
    sample of smokers and nonsmokers, the "exposed" category would
    correspond to "is a smoker" and the "disease" category would
    correspond to "has or had lung cancer".

    To pass the same data to ``relative_risk``, use::

        relative_risk(n11, n10 + n11, n01, n00 + n01)

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
           (second edition), Wiley, Hoboken, NJ, USA (2007).
    .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
           CRC Press LLC, Boca Raton, FL, USA (1996).

    Examples
    --------
    >>> from scipy.stats.contingency import relative_risk

    This example is from Example 3.1 of [2]_.  The results of a heart
    disease study are summarized in the following table::

                 High CAT   Low CAT    Total
                 --------   -------    -----
        CHD         27         44        71
        No CHD      95        443       538

        Total      122        487       609

    CHD is coronary heart disease, and CAT refers to the level of
    circulating catecholamine.  CAT is the "exposure" variable, and
    high CAT is the "exposed" category. So the data from the table
    to be passed to ``relative_risk`` is::

        exposed_cases = 27
        exposed_total = 122
        control_cases = 44
        control_total = 487

    >>> result = relative_risk(27, 122, 44, 487)
    >>> result.relative_risk
    2.4495156482861398

    Find the confidence interval for the relative risk.

    >>> result.confidence_interval(confidence_level=0.95)
    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)

    The interval does not contain 1, so the data supports the statement
    that high CAT is associated with greater risk of CHD.
    """
    # The ratio itself is a trivial calculation; the nontrivial work lives
    # in the `confidence_interval` method of the RelativeRiskResult class.

    exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
    exposed_total = _validate_int(exposed_total, 1, "exposed_total")
    control_cases = _validate_int(control_cases, 0, "control_cases")
    control_total = _validate_int(control_total, 1, "control_total")

    if exposed_cases > exposed_total:
        raise ValueError('exposed_cases must not exceed exposed_total.')
    if control_cases > control_total:
        raise ValueError('control_cases must not exceed control_total.')

    if exposed_cases == 0:
        # 0/0 is undefined (nan); 0/nonzero is 0.
        rr = np.nan if control_cases == 0 else 0.0
    elif control_cases == 0:
        # relative risk is nonzero/0.
        rr = np.inf
    else:
        exposed_risk = exposed_cases / exposed_total
        control_risk = control_cases / control_total
        rr = exposed_risk / control_risk

    return RelativeRiskResult(relative_risk=rr,
                              exposed_cases=exposed_cases,
                              exposed_total=exposed_total,
                              control_cases=control_cases,
                              control_total=control_total)
|
||||
@@ -0,0 +1,24 @@
|
||||
# This module exists only to allow Sphinx to generate docs
|
||||
# for the result objects returned by some functions in stats.
|
||||
|
||||
"""
|
||||
Result classes
|
||||
--------------
|
||||
|
||||
.. currentmodule:: scipy.stats._result_classes
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
RelativeRiskResult
|
||||
BinomTestResult
|
||||
TukeyHSDResult
|
||||
|
||||
"""
|
||||
|
||||
__all__ = ['BinomTestResult', 'RelativeRiskResult', 'TukeyHSDResult']
|
||||
|
||||
|
||||
from ._binomtest import BinomTestResult
|
||||
from ._relative_risk import RelativeRiskResult
|
||||
from ._hypotests import TukeyHSDResult
|
||||
@@ -0,0 +1,195 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
from ._unuran import unuran_wrapper
|
||||
from scipy._lib.deprecation import _deprecated
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
|
||||
def rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=1, c=0, random_state=None):
    """
    Generate random samples from a probability density function using the
    ratio-of-uniforms method.

    Parameters
    ----------
    pdf : callable
        A function with signature `pdf(x)` that is proportional to the
        probability density function of the distribution.
    umax : float
        The upper bound of the bounding rectangle in the u-direction.
    vmin : float
        The lower bound of the bounding rectangle in the v-direction.
    vmax : float
        The upper bound of the bounding rectangle in the v-direction.
    size : int or tuple of ints, optional
        Defining number of random variates (default is 1).
    c : float, optional.
        Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        If `random_state` is None (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.

    Returns
    -------
    rvs : ndarray
        The random variates distributed according to the probability
        distribution defined by the pdf.

    Raises
    ------
    ValueError
        If ``vmin >= vmax`` or ``umax <= 0``.
    RuntimeError
        If not a single variate has been accepted after at least 50000
        attempts.

    Notes
    -----
    Given a univariate probability density function `pdf` and a constant `c`,
    define the set ``A = {(u, v) : 0 < u <= sqrt(pdf(v/u + c))}``.
    If `(U, V)` is a random vector uniformly distributed over `A`,
    then `V/U + c` follows a distribution according to `pdf`.

    The above result (see [1]_, [2]_) can be used to sample random variables
    using only the pdf, i.e. no inversion of the cdf is required. Typical
    choices of `c` are zero or the mode of `pdf`. The set `A` is a subset of
    the rectangle ``R = [0, umax] x [vmin, vmax]`` where

    - ``umax = sup sqrt(pdf(x))``
    - ``vmin = inf (x - c) sqrt(pdf(x))``
    - ``vmax = sup (x - c) sqrt(pdf(x))``

    In particular, these values are finite if `pdf` is bounded and
    ``x**2 * pdf(x)`` is bounded (i.e. subquadratic tails).
    One can generate `(U, V)` uniformly on `R` and return
    `V/U + c` if `(U, V)` are also in `A` which can be directly
    verified.

    The algorithm is not changed if one replaces `pdf` by k * `pdf` for any
    constant k > 0. Thus, it is often convenient to work with a function
    that is proportional to the probability density function by dropping
    unnecessary normalization factors.

    Intuitively, the method works well if `A` fills up most of the
    enclosing rectangle such that the probability is high that `(U, V)`
    lies in `A` whenever it lies in `R` as the number of required
    iterations becomes too large otherwise. To be more precise, note that
    the expected number of iterations to draw `(U, V)` uniformly
    distributed on `R` such that `(U, V)` is also in `A` is given by
    the ratio ``area(R) / area(A) = 2 * umax * (vmax - vmin) / area(pdf)``,
    where `area(pdf)` is the integral of `pdf` (which is equal to one if the
    probability density function is used but can take on other values if a
    function proportional to the density is used). The equality holds since
    the area of `A` is equal to 0.5 * area(pdf) (Theorem 7.1 in [1]_).
    If the sampling fails to generate a single random variate after 50000
    iterations (i.e. not a single draw is in `A`), an exception is raised.

    If the bounding rectangle is not correctly specified (i.e. if it does not
    contain `A`), the algorithm samples from a distribution different from
    the one given by `pdf`. It is therefore recommended to perform a
    test such as `~scipy.stats.kstest` as a check.

    References
    ----------
    .. [1] L. Devroye, "Non-Uniform Random Variate Generation",
       Springer-Verlag, 1986.

    .. [2] W. Hoermann and J. Leydold, "Generating generalized inverse Gaussian
       random variates", Statistics and Computing, 24(4), p. 547--557, 2014.

    .. [3] A.J. Kinderman and J.F. Monahan, "Computer Generation of Random
       Variables Using the Ratio of Uniform Deviates",
       ACM Transactions on Mathematical Software, 3(3), p. 257--260, 1977.

    Examples
    --------
    >>> from scipy import stats
    >>> rng = np.random.default_rng()

    Simulate normally distributed random variables. It is easy to compute the
    bounding rectangle explicitly in that case. For simplicity, we drop the
    normalization factor of the density.

    >>> f = lambda x: np.exp(-x**2 / 2)
    >>> v_bound = np.sqrt(f(np.sqrt(2))) * np.sqrt(2)
    >>> umax, vmin, vmax = np.sqrt(f(0)), -v_bound, v_bound
    >>> rvs = stats.rvs_ratio_uniforms(f, umax, vmin, vmax, size=2500,
    ...                                random_state=rng)

    The K-S test confirms that the random variates are indeed normally
    distributed (normality is not rejected at 5% significance level):

    >>> stats.kstest(rvs, 'norm')[1]
    0.250634764150542

    The exponential distribution provides another example where the bounding
    rectangle can be determined explicitly.

    >>> rvs = stats.rvs_ratio_uniforms(lambda x: np.exp(-x), umax=1,
    ...                                vmin=0, vmax=2*np.exp(-1), size=1000,
    ...                                random_state=rng)
    >>> stats.kstest(rvs, 'expon')[1]
    0.21121052054580314

    """
    if vmin >= vmax:
        raise ValueError("vmin must be smaller than vmax.")

    if umax <= 0:
        raise ValueError("umax must be positive.")

    out_shape = tuple(np.atleast_1d(size))
    total = np.prod(out_shape)  # variates needed; reshaped on return

    rng = check_random_state(random_state)
    samples = np.zeros(total)
    filled = 0
    attempt = 1

    # Keep proposing until `total` variates have been accepted; the expected
    # runtime is finite. To avoid an infinite loop, give up if not a single
    # variate has been accepted after 50000 tries. Even if the expected
    # number of iterations is 1000, the probability of that event is
    # (1-1/1000)**50000, which is roughly 2e-22.
    while filled < total:
        remaining = total - filled
        # propose points uniformly on [0, umax] x [vmin, vmax]
        u_draw = umax * rng.uniform(size=remaining)
        v_draw = rng.uniform(vmin, vmax, size=remaining)
        # rejection step: keep candidates whose (u, v) lies in A
        candidates = v_draw / u_draw + c
        accepted = (u_draw**2 <= pdf(candidates))
        n_accepted = np.sum(accepted)
        if n_accepted > 0:
            stop = filled + n_accepted
            samples[filled:stop] = candidates[accepted]
            filled = stop
        elif filled == 0 and attempt*total >= 50000:
            msg = ("Not a single random variate could be generated in {} "
                   "attempts. The ratio of uniforms method does not appear "
                   "to work for the provided parameters. Please check the "
                   "pdf and the bounds.".format(attempt*total))
            raise RuntimeError(msg)
        attempt += 1

    return np.reshape(samples, out_shape)
|
||||
|
||||
|
||||
class NumericalInverseHermite:
    """
    Deprecated wrapper around ``unuran_wrapper.NumericalInverseHermite``.

    All constructor arguments are forwarded to the wrapped class, and
    `rvs`, `ppf` and `qrvs` delegate to the wrapped instance, which is
    stored as ``hinv``.
    """

    @_deprecated(
        "NumericalInverseHermite has been deprecated from `scipy.stats`. "
        " To use `NumericalInverseHermite`, import/use it from "
        "`scipy.stats.sampling` module instead. "
        "i.e. `from scipy.stats.sampling import NumericalInverseHermite`"
    )
    def __init__(self, *args, **kwargs):
        wrapped = unuran_wrapper.NumericalInverseHermite(*args, **kwargs)
        self.hinv = wrapped
        # copied once at construction time from the wrapped instance
        self.intervals = wrapped.intervals
        self.midpoint_error = wrapped.midpoint_error

    def rvs(self, *args, **kwargs):
        """Forward to ``rvs`` of the wrapped instance."""
        return self.hinv.rvs(*args, **kwargs)

    def ppf(self, *args, **kwargs):
        """Forward to ``ppf`` of the wrapped instance."""
        return self.hinv.ppf(*args, **kwargs)

    def qrvs(self, *args, **kwargs):
        """Forward to ``qrvs`` of the wrapped instance."""
        return self.hinv.qrvs(*args, **kwargs)
|
||||
Binary file not shown.
@@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
from scipy._lib._util import IntNumber
|
||||
from typing_extensions import Literal
|
||||
|
||||
# NOTE(review): every definition below is a signature with an `...` body and
# no implementation; presumably these are typing stubs for a compiled module
# -- confirm against the extension module.

# Declared to return None; any result is presumably written into `v` --
# confirm.
def initialize_v(
    v : np.ndarray,
    dim : IntNumber
) -> None: ...

# Declared to return None; takes a dimension and two arrays.
def _cscramble (
    dim : IntNumber,
    ltm : np.ndarray,
    sv: np.ndarray
) -> None: ...

# Declared to return None; presumably fills `p_cumulative` from `p` in
# place -- confirm.
def _fill_p_cumulative(
    p: np.ndarray,
    p_cumulative: np.ndarray
) -> None: ...

# Declared to return None; presumably writes its output into `result` --
# confirm.
def _draw(
    n : IntNumber,
    num_gen: IntNumber,
    dim: IntNumber,
    sv: np.ndarray,
    quasi: np.ndarray,
    result: np.ndarray
) -> None: ...

# Same arguments as `_draw` except that there is no `result` array.
def _fast_forward(
    n: IntNumber,
    num_gen: IntNumber,
    dim: IntNumber,
    sv: np.ndarray,
    quasi: np.ndarray
) -> None: ...

# Declared to return None; presumably writes its output into `result` --
# confirm.
def _categorize(
    draws: np.ndarray,
    p_cumulative: np.ndarray,
    result: np.ndarray
) -> None: ...

# Takes no arguments and is declared to return None.
def initialize_direction_numbers() -> None: ...

# Module-level constants, each typed as a single literal integer value.
_MAXDIM: Literal[21201]
_MAXBIT: Literal[30]
_MAXDEG: Literal[18]

# Declared to return an int; takes a cumulative array, a size and a value.
def _test_find_index(
    p_cumulative: np.ndarray,
    size: int,
    value: float
) -> int: ...
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,484 @@
|
||||
import numpy as np
|
||||
import scipy.stats._stats_py
|
||||
from . import distributions
|
||||
from .._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
|
||||
|
||||
# This is not a namedtuple for backwards compatibility. See PR #12983
|
||||
LinregressResult = _make_tuple_bunch('LinregressResult',
|
||||
['slope', 'intercept', 'rvalue',
|
||||
'pvalue', 'stderr'],
|
||||
extra_field_names=['intercept_stderr'])
|
||||
|
||||
|
||||
def linregress(x, y=None, alternative='two-sided'):
    """
    Calculate a linear least-squares regression for two sets of measurements.

    Parameters
    ----------
    x, y : array_like
        Two sets of measurements.  Both arrays should have the same length.  If
        only `x` is given (and ``y=None``), then it must be a two-dimensional
        array where one dimension has length 2.  The two sets of measurements
        are then found by splitting the array along the length-2 dimension. In
        the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
        equivalent to ``linregress(x[0], x[1])``.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the slope of the regression line is nonzero
        * 'less': the slope of the regression line is less than zero
        * 'greater':  the slope of the regression line is greater than zero

        .. versionadded:: 1.7.0

    Returns
    -------
    result : ``LinregressResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Slope of the regression line.
        intercept : float
            Intercept of the regression line.
        rvalue : float
            The Pearson correlation coefficient. The square of ``rvalue``
            is equal to the coefficient of determination.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            that the slope is zero, using Wald Test with t-distribution of
            the test statistic. See `alternative` above for alternative
            hypotheses.
        stderr : float
            Standard error of the estimated slope (gradient), under the
            assumption of residual normality.
        intercept_stderr : float
            Standard error of the estimated intercept, under the assumption
            of residual normality.

    Raises
    ------
    ValueError
        If only `x` is given and it is not of shape (2, N) or (N, 2), if
        either input is empty, or if all `x` values are identical.

    See Also
    --------
    scipy.optimize.curve_fit :
        Use non-linear least squares to fit a function to data.
    scipy.optimize.leastsq :
        Minimize the sum of squares of a set of equations.

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in `x`,
    the corresponding value in `y` is masked.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
    ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::

        slope, intercept, r, p, se = linregress(x, y)

    With that style, however, the standard error of the intercept is not
    available.  To have access to all the computed values, including the
    standard error of the intercept, use the return value as an object
    with attributes, e.g.::

        result = linregress(x, y)
        print(result.intercept, result.intercept_stderr)

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    >>> rng = np.random.default_rng()

    Generate some data:

    >>> x = rng.random(10)
    >>> y = 1.6*x + rng.random(10)

    Perform the linear regression:

    >>> res = stats.linregress(x, y)

    Coefficient of determination (R-squared):

    >>> print(f"R-squared: {res.rvalue**2:.6f}")
    R-squared: 0.717533

    Plot the data along with the fitted line:

    >>> plt.plot(x, y, 'o', label='original data')
    >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
    >>> plt.legend()
    >>> plt.show()

    Calculate 95% confidence interval on slope and intercept:

    >>> # Two-sided inverse Students t-distribution
    >>> # p - probability, df - degrees of freedom
    >>> from scipy.stats import t
    >>> tinv = lambda p, df: abs(t.ppf(p/2, df))

    >>> ts = tinv(0.05, len(x)-2)
    >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
    slope (95%): 1.453392 +/- 0.743465
    >>> print(f"intercept (95%): {res.intercept:.6f}"
    ...       f" +/- {ts*res.intercept_stderr:.6f}")
    intercept (95%): 0.616950 +/- 0.544475

    """
    # Keeps the denominator of the t statistic nonzero when |r| == 1.
    TINY = 1.0e-20
    if y is None:  # x is a (2, N) or (N, 2) shaped array_like
        x = np.asarray(x)
        # Guard each shape lookup with `ndim` so that 0-d and 1-D inputs
        # reach the informative ValueError below instead of an IndexError.
        if x.ndim >= 1 and x.shape[0] == 2:
            x, y = x
        elif x.ndim >= 2 and x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, it has to "
                             "be of shape (2, N) or (N, 2); provided shape "
                             f"was {x.shape}.")
    else:
        x = np.asarray(x)
        y = np.asarray(y)

    if x.size == 0 or y.size == 0:
        raise ValueError("Inputs must not be empty.")

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    n = len(x)
    xmean = np.mean(x, None)
    ymean = np.mean(y, None)

    # Average sums of square differences from the mean
    #   ssxm = mean( (x-mean(x))^2 )
    #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )
    ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat

    # R-value
    #   r = ssxym / sqrt( ssxm * ssym )
    if ssxm == 0.0 or ssym == 0.0:
        # If the denominator was going to be 0
        r = 0.0
    else:
        r = ssxym / np.sqrt(ssxm * ssym)
        # Test for numerical error propagation (make sure -1 < r < 1)
        if r > 1.0:
            r = 1.0
        elif r < -1.0:
            r = -1.0

    slope = ssxym / ssxm
    intercept = ymean - slope*xmean
    if n == 2:
        # handle case when only two points are passed in
        if y[0] == y[1]:
            prob = 1.0
        else:
            prob = 0.0
        slope_stderr = 0.0
        intercept_stderr = 0.0
    else:
        df = n - 2  # Number of degrees of freedom
        # n-2 degrees of freedom because 2 has been used up
        # to estimate the mean and standard deviation
        t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
        t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)

        slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)

        # Also calculate the standard error of the intercept
        # The following relationship is used:
        #   ssxm = mean( (x-mean(x))^2 )
        #        = ssx - sx*sx
        #        = mean( x^2 ) - mean(x)^2
        intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)

    return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
                            pvalue=prob, stderr=slope_stderr,
                            intercept_stderr=intercept_stderr)
|
||||
|
||||
|
||||
def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression.  It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing estimate for intercept.
        Following methods are supported,

            * 'joint': Uses np.median(y - medslope * x) as intercept.
            * 'separate': Uses np.median(y) - medslope * np.median(x)
                          as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    medslope : float
        Theil slope.
    medintercept : float
        Intercept of the Theil line.
    lo_slope : float
        Lower bound of the confidence interval on `medslope`.
    up_slope : float
        Upper bound of the confidence interval on `medslope`.

    Raises
    ------
    ValueError
        If `method` is not ``'joint'`` or ``'separate'``, or if `x` and `y`
        do not have the same number of elements.

    See also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    The implementation of `theilslopes` follows [1]_. The intercept is
    not defined in [1]_, and here it is defined as ``median(y) -
    medslope*median(x)``, which is given in [3]_. Other definitions of
    the intercept exist in the literature such as  ``median(y - medslope*x)``
    in [4]_. The approach to compute the intercept can be determined by the
    parameter ``method``. A confidence interval for the intercept is not
    given as this question is not addressed in [1]_.

    Only pairs of points with distinct `x` contribute a slope.  If there is
    no such pair (all `x` identical, or fewer than two points), a
    `RuntimeWarning` is emitted and all four returned values are ``nan``.
    If ties make the variance estimate of equation (2.6) in [1]_ negative,
    the slope and intercept are still returned but both confidence bounds
    are ``nan``.

    References
    ----------
    .. [1] P.K. Sen, "Estimates of the regression coefficient based on
           Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
           regression analysis I, II and III",  Nederl. Akad. Wetensch., Proc.
           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
    .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
           John Wiley and Sons, New York, pp. 493.
    .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope, intercept and 90% confidence interval.  For comparison,
    also compute the least-squares fit with `linregress`:

    >>> res = stats.theilslopes(y, x, 0.90, method='separate')
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Theil-Sen regression line is shown in red, with the
    dashed red lines illustrating the confidence interval of the slope (note
    that the dashed red lines are not the confidence interval of the regression
    as the confidence interval of the intercept is not included). The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    import warnings

    if method not in ['joint', 'separate']:
        raise ValueError(("method must be either 'joint' or 'separate'."
                          "'{}' is invalid.".format(method)))
    # We copy both x and y so we can use _find_repeats.
    y = np.array(y).flatten()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.array(x, dtype=float).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" %
                             (len(y), len(x)))

    # Compute sorted slopes only when deltax > 0, so that every unordered
    # pair of points with distinct x is used exactly once.
    deltax = x[:, np.newaxis] - x
    deltay = y[:, np.newaxis] - y
    positive_dx = deltax > 0
    slopes = deltay[positive_dx] / deltax[positive_dx]
    slopes.sort()

    if slopes.size == 0:
        # No pair of points has distinct x, so neither the slope nor its
        # confidence interval is defined.  Report that instead of failing
        # later while indexing into the empty `slopes` array.
        warnings.warn("No two points have distinct `x` coordinates; the "
                      "Theil-Sen slope is undefined.",
                      RuntimeWarning, stacklevel=2)
        return np.nan, np.nan, np.nan, np.nan

    medslope = np.median(slopes)
    if method == 'joint':
        medinter = np.median(y - medslope * x)
    else:
        medinter = np.median(y) - medslope * np.median(x)

    # Now compute confidence intervals
    if alpha > 0.5:
        alpha = 1. - alpha

    z = distributions.norm.ppf(alpha / 2.)
    # This implements (2.6) from Sen (1968)
    # NOTE: _find_repeats may sort its argument in place; x and y are
    # private copies and are not needed in their original order below.
    _, nxreps = _find_repeats(x)
    _, nyreps = _find_repeats(y)
    nt = len(slopes)       # N in Sen (1968)
    ny = len(y)            # n in Sen (1968)
    # Equation 2.6 in Sen (1968):
    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
                     sum(k * (k-1) * (2*k + 5) for k in nxreps) -
                     sum(k * (k-1) * (2*k + 5) for k in nyreps))
    if sigsq < 0:
        # Heavy ties in x and y can drive the tie-corrected variance below
        # zero; its square root is then undefined, so no interval exists.
        return medslope, medinter, np.nan, np.nan

    # Find the confidence interval indices in `slopes`
    sigma = np.sqrt(sigsq)
    Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
    Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
    delta = slopes[[Rl, Ru]]
    return medslope, medinter, delta[0], delta[1]
|
||||
|
||||
|
||||
def _find_repeats(arr):
    """Find the values that occur more than once in `arr`.

    Parameters
    ----------
    arr : array_like
        Input values.  They are converted to ``float64`` and flattened.

    Returns
    -------
    values : ndarray
        1-D ``float64`` array of the distinct values that occur at least
        twice, in ascending order.
    counts : ndarray
        1-D integer array, same length as `values`, with the number of
        occurrences of each repeated value.

    Notes
    -----
    Both outputs are empty 1-D arrays when the input has no elements or no
    repeated values, so callers can always iterate over them.
    """
    # This function assumes it may clobber its input.
    # XXX This cast was previously needed for the Fortran implementation,
    # should we ditch it?
    arr = np.asarray(arr, np.float64).ravel()
    if arr.size == 0:
        # Return real empty arrays (not 0-d arrays holding 0) so the result
        # has the same form as the "no repeats" case below.
        return np.array([], np.float64), np.array([], np.intp)

    # When `arr` is already a contiguous float64 array, `ravel` returned a
    # view, so this sort reorders the caller's data in place.
    arr.sort()

    # Taken from NumPy 1.9's np.unique.
    # `change` marks the first element of every run of equal values.
    change = np.concatenate(([True], arr[1:] != arr[:-1]))
    unique = arr[change]
    # Run lengths are the gaps between consecutive run starts, with the
    # array length appended to close the last run.
    change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
    freq = np.diff(change_idx)
    atleast2 = freq > 1
    return unique[atleast2], freq[atleast2]
|
||||
|
||||
|
||||
def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` fits a line to the points (x, y) by robust linear
    regression based on repeated medians (see [1]_).  The estimator
    tolerates outliers, with an asymptotic breakdown point of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        How the intercept is estimated.  With 'hierarchical' (the default)
        the intercept is derived from the estimated slope ``medslope``.
        With 'separate' the intercept is estimated independently of the
        slope.  See Notes for details.

    Returns
    -------
    medslope : float
        Estimate of the slope of the regression line.
    medintercept : float
        Estimate of the intercept of the regression line.

    Raises
    ------
    ValueError
        If `method` is not one of the two supported values, or if `x` and
        `y` have different numbers of elements.

    See also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    Let ``n = len(y)``.  For every point ``(x[j], y[j])`` the median
    ``m_j`` of the slopes to the other points with a different `x` value
    is computed; ``medslope`` is the median of all ``m_j``.

    [1]_ gives two ways of estimating the intercept, selected through the
    ``method`` parameter.  The hierarchical way takes ``medintercept`` to be
    the median of ``y - medslope*x``.  The separate way computes, for each
    point ``(x[j], y[j])``, the median ``i_j`` of the intercepts of the
    lines joining it to the other points, and takes ``medintercept`` to be
    the median of the ``i_j``.

    The implementation evaluates `n` medians of vectors of size up to `n`,
    which can be slow for large inputs.  More efficient algorithms exist
    (see [2]_) but are not implemented here.

    References
    ----------
    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
           Biometrika, Vol. 69, pp. 242-244, 1982.

    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
           line", Proceedings of the Third Annual ACM-SIAM Symposium on
           Discrete Algorithms, pp. 409-413, 1992.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope and intercept.  For comparison, also compute the
    least-squares fit with `linregress`:

    >>> res = stats.siegelslopes(y, x)
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Siegel regression line is shown in red. The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    if method not in ('hierarchical', 'separate'):
        raise ValueError("method can only be 'hierarchical' or 'separate'")

    y = np.asarray(y).ravel()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.asarray(x, dtype=float).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" %
                             (len(y), len(x)))

    want_separate = method == 'separate'

    # Row j of each matrix holds the differences between point j and
    # every point (including itself).
    dx = x[:, np.newaxis] - x
    dy = y[:, np.newaxis] - y

    point_slopes = []
    point_intercepts = []
    for j, (dx_row, dy_row) in enumerate(zip(dx, dy)):
        # Drop the point itself and any other point sharing its x value,
        # for which no finite slope exists.
        distinct = dx_row != 0
        dx_used = dx_row[distinct]
        point_slopes.append(np.median(dy_row[distinct] / dx_used))
        if want_separate:
            # (y[k]*x[j] - y[j]*x[k]) / (x[j] - x[k]) is the intercept of
            # the line through points j and k.
            cross = y*x[j] - y[j]*x
            point_intercepts.append(np.median(cross[distinct] / dx_used))

    medslope = np.median(np.asarray(point_slopes))
    if want_separate:
        medinter = np.median(np.asarray(point_intercepts))
    else:
        medinter = np.median(y - medslope*x)

    return medslope, medinter
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,199 @@
|
||||
import numpy as np
|
||||
from numpy import poly1d
|
||||
from scipy.special import beta
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda variance function.  Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
#
# def mpvar(lam):
#     if lam == 0:
#         v = mp.pi**2 / three
#     else:
#         v = (two / lam**2) * (one / (one + two*lam) -
#                               mp.beta(lam + one, lam + one))
#     return v
#
# t = mp.taylor(mpvar, 0, 8)
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda variance function, listed from the
# constant term upwards.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
                       -0.5370742306855439, 0.17292046290190008,
                       -0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
                       1.7660926747377275, 0.2643989311168465]

# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda variance.  poly1d expects the
# highest-order coefficient first, hence the reversal.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])


def tukeylambda_variance(lam):
    """Variance of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the variance.

    Returns
    -------
    v : ndarray
        The variance, with the same shape as `lam` (a 0-d array for scalar
        input).  For lam < -0.5, the variance is not defined, so np.nan is
        returned.  For lam = -0.5, np.inf is returned.

    Notes
    -----
    In an interval around lambda=0, this function uses the [4,4] Pade
    approximation to compute the variance.  Otherwise it uses the standard
    formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution).  The
    Pade approximation is used because the standard formula has a removable
    discontinuity at lambda = 0, and does not produce accurate numerical
    results near lambda = 0.
    """
    lam = np.asarray(lam)
    out_shape = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # The Pade approximation is used where abs(lam) is below this radius.
    pade_radius = 0.075

    # Split the input into four disjoint cases:
    #   lambda < -0.5             -> nan
    #   lambda == -0.5            -> inf
    #   abs(lambda) < pade_radius -> Pade approximation
    #   everything else           -> explicit formula
    undefined = lam < -0.5
    infinite = lam == -0.5
    near_zero = np.abs(lam) < pade_radius
    regular = ~(undefined | infinite | near_zero)

    v = np.empty_like(lam)
    v[undefined] = np.nan
    v[infinite] = np.inf

    lam_near_zero = lam[near_zero]
    if lam_near_zero.size > 0:
        numerator = _tukeylambda_var_p(lam_near_zero)
        denominator = _tukeylambda_var_q(lam_near_zero)
        v[near_zero] = numerator / denominator

    lam_regular = lam[regular]
    if lam_regular.size > 0:
        v[regular] = (2.0 / lam_regular**2) * (
            1.0 / (1.0 + 2 * lam_regular) -
            beta(lam_regular + 1, lam_regular + 1))

    # Restore the caller's shape; scalar input yields a 0-d array.
    return v.reshape(out_shape)
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda kurtosis function.  Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
# four = mp.mpf(4)
#
# def mpkurt(lam):
#     if lam == 0:
#         k = mp.mpf(6)/5
#     else:
#         numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
#                  three*mp.beta(two*lam+one, two*lam+one))
#         denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
#         k = numer / denom - three
#     return k
#
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
# # taylor function and we request a degree 9 Taylor polynomial, we actually
# # get degree 8.
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
# t = [mp.chop(c, tol=1e-15) for c in t]
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda kurtosis function, listed from the
# constant term upwards.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
                        0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
                        0.43075235247853005, -2.789746758009912]

# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda kurtosis.  poly1d expects the
# highest-order coefficient first, hence the reversal.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])


def tukeylambda_kurtosis(lam):
    """Kurtosis of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the kurtosis.

    Returns
    -------
    k : ndarray
        The kurtosis, with the same shape as `lam` (a 0-d array for scalar
        input).  For lam < -0.25, the kurtosis is not defined, so np.nan is
        returned.  For lam = -0.25, np.inf is returned.

    Notes
    -----
    For ``abs(lam)`` below a small threshold the value is computed from a
    [4,4] Pade approximation; elsewhere the explicit formula in terms of
    the beta function is used, from which 3 is subtracted.
    """
    lam = np.asarray(lam)
    out_shape = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # The Pade approximation is used where abs(lam) is below this radius.
    pade_radius = 0.055

    # Split the input into four disjoint cases:
    #   lambda < -0.25            -> nan
    #   lambda == -0.25           -> inf
    #   abs(lambda) < pade_radius -> Pade approximation
    #   everything else           -> explicit formula
    undefined = lam < -0.25
    infinite = lam == -0.25
    near_zero = np.abs(lam) < pade_radius
    regular = ~(undefined | infinite | near_zero)

    k = np.empty_like(lam)
    k[undefined] = np.nan
    k[infinite] = np.inf

    lam_near_zero = lam[near_zero]
    if lam_near_zero.size > 0:
        numerator = _tukeylambda_kurt_p(lam_near_zero)
        denominator = _tukeylambda_kurt_q(lam_near_zero)
        k[near_zero] = numerator / denominator

    lam_regular = lam[regular]
    if lam_regular.size > 0:
        numer = (1.0 / (4 * lam_regular + 1) -
                 4 * beta(3 * lam_regular + 1, lam_regular + 1) +
                 3 * beta(2 * lam_regular + 1, 2 * lam_regular + 1))
        denom = 2 * (1.0/(2 * lam_regular + 1) -
                     beta(lam_regular + 1, lam_regular + 1))**2
        k[regular] = numer / denom - 3

    # The return value will be a numpy array; restoring the input shape
    # ensures that if `lam` was a scalar, the return value is a 0-d array.
    return k.reshape(out_shape)
|
||||
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user