first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Rudimentary Apache Arrow-backed ExtensionArray.
|
||||
|
||||
At the moment, just boolean and string arrays / types are implemented.
|
||||
Eventually, we'll want to parametrize the type and support
|
||||
multiple dtypes. Not all methods are implemented yet, and the
|
||||
current implementation is not efficient.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
|
||||
from pandas._typing import type_t
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import (
|
||||
ExtensionArray,
|
||||
ExtensionDtype,
|
||||
register_extension_dtype,
|
||||
take,
|
||||
)
|
||||
from pandas.api.types import is_scalar
|
||||
from pandas.core.arraylike import OpsMixin
|
||||
from pandas.core.construction import extract_array
|
||||
|
||||
|
||||
@register_extension_dtype
class ArrowBoolDtype(ExtensionDtype):
    """Registered extension dtype describing ``ArrowBoolArray``."""

    name = "arrow_bool"
    kind = "b"
    type = np.bool_
    na_value = pa.NULL

    @property
    def _is_boolean(self) -> bool:
        # This dtype always reports itself as boolean.
        return True

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowBoolArray]:
        """
        Return the array class that pairs with this dtype.

        Returns
        -------
        type
        """
        return ArrowBoolArray
|
||||
|
||||
|
||||
@register_extension_dtype
class ArrowStringDtype(ExtensionDtype):
    """Registered extension dtype describing ``ArrowStringArray``."""

    name = "arrow_string"
    kind = "U"
    type = str
    na_value = pa.NULL

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowStringArray]:
        """
        Return the array class that pairs with this dtype.

        Returns
        -------
        type
        """
        return ArrowStringArray
|
||||
|
||||
|
||||
class ArrowExtensionArray(OpsMixin, ExtensionArray):
    """
    Shared machinery for the rudimentary pyarrow-backed test arrays.

    Subclasses provide ``__init__``, which stores a ``pa.ChunkedArray`` in
    ``_data`` and the matching dtype instance in ``_dtype``.  Most methods
    convert to pandas/NumPy objects and back, so they favour simplicity over
    efficiency.
    """

    _data: pa.ChunkedArray

    @classmethod
    def from_scalars(cls, values):
        """
        Build an array from scalars, an array-like, or another instance.

        Parameters
        ----------
        values : sequence, array-like or ArrowExtensionArray
            An instance of ``cls`` is re-wrapped around its existing data.

        Returns
        -------
        cls
        """
        if isinstance(values, cls):
            # in particular for empty cases the pa.array(np.asarray(...))
            # does not round-trip
            return cls(values._data)

        elif not len(values):
            if isinstance(values, list):
                # An empty list carries no type information, so pick the
                # NumPy dtype from the target class.
                dtype = bool if cls is ArrowBoolArray else str
                values = np.array([], dtype=dtype)

        arr = pa.chunked_array([pa.array(np.asarray(values))])
        return cls(arr)

    @classmethod
    def from_array(cls, arr):
        """Wrap a single ``pa.Array`` as a one-chunk array of this class."""
        assert isinstance(arr, pa.Array)
        return cls(pa.chunked_array([arr]))

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from scalars; ``dtype`` and ``copy`` are accepted but unused."""
        return cls.from_scalars(scalars)

    def __repr__(self):
        return f"{type(self).__name__}({repr(self._data)})"

    def __contains__(self, obj) -> bool:
        if obj is None or obj is self.dtype.na_value:
            # None -> EA.__contains__ only checks for self._dtype.na_value, not
            # any compatible NA value.
            # self.dtype.na_value -> <pa.NullScalar:None> isn't recognized by pd.isna
            return bool(self.isna().any())
        return bool(super().__contains__(obj))

    def __getitem__(self, item):
        """Return a scalar for scalar ``item``, otherwise a new array."""
        if is_scalar(item):
            return self._data.to_pandas()[item]
        else:
            vals = self._data.to_pandas()[item]
            return type(self).from_scalars(vals)

    def __len__(self):
        return len(self._data)

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; casting to our own dtype returns self or a copy."""
        # needed to fix this astype for the Series constructor.
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return super().astype(dtype, copy)

    @property
    def dtype(self):
        """The dtype instance stored by the subclass ``__init__``."""
        return self._dtype

    def _logical_method(self, other, op):
        """
        Apply ``op`` element-wise against another array of the same class.

        Raises
        ------
        NotImplementedError
            If ``other`` is not an instance of ``type(self)``.
        """
        if not isinstance(other, type(self)):
            raise NotImplementedError()

        result = op(np.array(self._data), np.array(other._data))
        # NOTE(review): the null mask comes from ``self`` only; nulls in
        # ``other`` are not propagated -- confirm this is intended.
        return ArrowBoolArray(
            pa.chunked_array([pa.array(result, mask=pd.isna(self._data.to_pandas()))])
        )

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            # TODO: use some pyarrow function here?
            return np.asarray(self).__eq__(other)

        return self._logical_method(other, operator.eq)

    @property
    def nbytes(self) -> int:
        """Total size in bytes of every non-null buffer across all chunks."""
        return sum(
            x.size
            for chunk in self._data.chunks
            for x in chunk.buffers()
            if x is not None
        )

    def isna(self):
        """
        Return a boolean mask marking the missing values.

        Returns
        -------
        ArrowBoolArray
        """
        nas = pd.isna(self._data.to_pandas())
        # The mask is boolean whatever this array's element type is, so it is
        # always wrapped in ArrowBoolArray.  Building it with ``type(self)``
        # would hand boolean data to e.g. the string subclass, whose
        # ``__init__`` asserts on the arrow type.
        return ArrowBoolArray.from_scalars(nas)

    def take(self, indices, allow_fill=False, fill_value=None):
        """Take elements by position via ``pandas.api.extensions.take``."""
        data = self._data.to_pandas()
        data = extract_array(data, extract_numpy=True)

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
        return self._from_sequence(result, dtype=self.dtype)

    def copy(self):
        """Return a new array wrapping ``copy.copy`` of the chunked data."""
        return type(self)(copy.copy(self._data))

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate by chaining the chunks of every input array."""
        chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
        arr = pa.chunked_array(chunks)
        return cls(arr)

    def __invert__(self):
        return type(self).from_scalars(~self._data.to_pandas())

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Dispatch the reduction ``name`` to the method of the same name.

        Raises
        ------
        TypeError
            If this array has no method called ``name``.
        """
        if skipna:
            arr = self[~self.isna()]
        else:
            arr = self

        try:
            op = getattr(arr, name)
        except AttributeError as err:
            raise TypeError(
                f"cannot perform {name} with type {type(self).__name__}"
            ) from err
        return op(**kwargs)

    def any(self, axis=0, out=None):
        """Return whether any element is true; ``axis``/``out`` are unused."""
        # Explicitly return a plain bool to reproduce GH-34660
        return bool(self._data.to_pandas().any())

    def all(self, axis=0, out=None):
        """Return whether all elements are true; ``axis``/``out`` are unused."""
        # Explicitly return a plain bool to reproduce GH-34660
        return bool(self._data.to_pandas().all())
|
||||
|
||||
|
||||
class ArrowBoolArray(ArrowExtensionArray):
    """Extension array of booleans backed by a ``pa.ChunkedArray``."""

    def __init__(self, values):
        """
        Parameters
        ----------
        values : pa.ChunkedArray
            Chunked array whose arrow type is ``pa.bool_()``.

        Raises
        ------
        ValueError
            If ``values`` is not a ``pa.ChunkedArray``.
        AssertionError
            If the arrow type of ``values`` is not ``pa.bool_()``.
        """
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError(
                f"values must be a pyarrow.ChunkedArray, got {type(values).__name__}"
            )

        # Still an assert, so the exception type is unchanged; the message
        # only adds the offending arrow type.
        assert values.type == pa.bool_(), f"expected bool data, got {values.type}"
        self._data = values
        self._dtype = ArrowBoolDtype()
|
||||
|
||||
|
||||
class ArrowStringArray(ArrowExtensionArray):
    """Extension array of strings backed by a ``pa.ChunkedArray``."""

    def __init__(self, values):
        """
        Parameters
        ----------
        values : pa.ChunkedArray
            Chunked array whose arrow type is ``pa.string()``.

        Raises
        ------
        ValueError
            If ``values`` is not a ``pa.ChunkedArray``.
        AssertionError
            If the arrow type of ``values`` is not ``pa.string()``.
        """
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError(
                f"values must be a pyarrow.ChunkedArray, got {type(values).__name__}"
            )

        # Still an assert, so the exception type is unchanged; the message
        # only adds the offending arrow type.
        assert values.type == pa.string(), f"expected string data, got {values.type}"
        self._data = values
        self._dtype = ArrowStringDtype()
|
||||
@@ -0,0 +1,113 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import (
|
||||
is_ci_environment,
|
||||
is_platform_windows,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import is_bool_dtype
|
||||
from pandas.tests.extension import base
|
||||
|
||||
pytest.importorskip("pyarrow", minversion="1.0.1")
|
||||
|
||||
from pandas.tests.extension.arrow.arrays import ( # isort:skip
|
||||
ArrowBoolArray,
|
||||
ArrowBoolDtype,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Provide a fresh ``ArrowBoolDtype`` instance."""
    bool_dtype = ArrowBoolDtype()
    return bool_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Provide 100 random booleans whose first two entries always differ."""
    flags = np.random.randint(0, 2, size=100, dtype=bool)
    # Force element 1 to be the negation of element 0.
    flags[1] = ~flags[0]
    return ArrowBoolArray.from_scalars(flags)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """Provide a length-2 array: one missing value followed by ``True``."""
    scalars = [None, True]
    return ArrowBoolArray.from_scalars(scalars)
|
||||
|
||||
|
||||
def test_basic_equals(data):
    """Two Series built from the same array compare equal via ``equals``."""
    # https://github.com/pandas-dev/pandas/issues/34660
    left = pd.Series(data)
    assert left.equals(pd.Series(data))
|
||||
|
||||
|
||||
class BaseArrowTests:
    """Empty marker base shared by the Arrow test classes in this module."""
|
||||
|
||||
|
||||
class TestDtype(BaseArrowTests, base.BaseDtypeTests):
    """Run the inherited dtype tests unchanged."""
|
||||
|
||||
|
||||
class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
    """Interface tests; copy/view are reduced to smoke tests."""

    def test_copy(self, data):
        """Only check that ``copy`` can be called."""
        # __setitem__ does not work, so we only have a smoke-test
        _ = data.copy()

    def test_view(self, data):
        """Only check that ``view`` can be called."""
        # __setitem__ does not work, so we only have a smoke-test
        _ = data.view()

    @pytest.mark.xfail(
        reason="Doesn't recognize data._na_value as NA",
        raises=AssertionError,
    )
    def test_contains(self, data, data_missing):
        """Run the inherited test; marked as an expected ``AssertionError``."""
        super().test_contains(data, data_missing)
|
||||
|
||||
|
||||
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
    """Constructor tests; each override only attaches an xfail marker."""

    # Shared marker for the two tests that fail for the same reason.
    _xfail_null_scalar = pytest.mark.xfail(
        reason="pa.NULL is not recognised as scalar, GH-33899"
    )

    # seems like some bug in isna on empty BoolArray returning floats.
    @pytest.mark.xfail(reason="bad is-na for empty data")
    def test_from_sequence_from_cls(self, data):
        super().test_from_sequence_from_cls(data)

    @_xfail_null_scalar
    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
        super().test_series_constructor_no_data_with_index(dtype, na_value)

    @_xfail_null_scalar
    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
        super().test_series_constructor_scalar_na_with_index(dtype, na_value)

    @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types")
    def test_construct_empty_dataframe(self, dtype):
        super().test_construct_empty_dataframe(dtype)

    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
    def test_empty(self, dtype):
        super().test_empty(dtype)
|
||||
|
||||
|
||||
class TestReduce(base.BaseNoReduceTests):
    """No-reduce tests, with the boolean Series case turned into a no-op."""

    def test_reduce_series_boolean(self):
        """Override the inherited test with an empty body."""
        return None
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    is_ci_environment() and is_platform_windows(),
    reason="Causes stack overflow on Windows CI",
)
class TestReduceBoolean(base.BaseBooleanReduceTests):
    """Run the inherited boolean-reduction tests (skipped on Windows CI)."""
|
||||
|
||||
|
||||
def test_is_bool_dtype(data):
    """The array is recognised as boolean and works as a Series mask."""
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)

    ser = pd.Series(range(len(data)))
    masked_by_array = ser[data]
    # Indexing with the NumPy view of the same mask is the reference result.
    masked_by_numpy = ser[np.asarray(data)]
    tm.assert_series_equal(masked_by_array, masked_by_numpy)
|
||||
@@ -0,0 +1,12 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
|
||||
pytest.importorskip("pyarrow", minversion="1.0.0")
|
||||
|
||||
|
||||
def test_constructor_from_list():
    """A list plus a pyarrow-backed ``StringDtype`` keeps that dtype."""
    # GH 27673
    requested = pd.StringDtype(storage="pyarrow")
    ser = pd.Series(["E"], dtype=requested)

    actual = ser.dtype
    assert isinstance(actual, pd.StringDtype)
    assert actual.storage == "pyarrow"
|
||||
@@ -0,0 +1,60 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas._typing import type_t
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import (
|
||||
ExtensionDtype,
|
||||
register_extension_dtype,
|
||||
)
|
||||
|
||||
pytest.importorskip("pyarrow", minversion="1.0.1")
|
||||
|
||||
import pyarrow as pa # isort:skip
|
||||
|
||||
from pandas.tests.extension.arrow.arrays import ArrowExtensionArray # isort:skip
|
||||
|
||||
|
||||
@register_extension_dtype
class ArrowTimestampUSDtype(ExtensionDtype):
    """Registered extension dtype describing ``ArrowTimestampUSArray``."""

    name = "arrow_timestamp_us"
    kind = "M"
    type = datetime.datetime
    na_value = pa.NULL

    @classmethod
    def construct_array_type(cls) -> type_t[ArrowTimestampUSArray]:
        """
        Return the array class that pairs with this dtype.

        Returns
        -------
        type
        """
        return ArrowTimestampUSArray
|
||||
|
||||
|
||||
class ArrowTimestampUSArray(ArrowExtensionArray):
    """Extension array of microsecond timestamps backed by a ``pa.ChunkedArray``."""

    def __init__(self, values):
        """
        Parameters
        ----------
        values : pa.ChunkedArray
            Chunked array whose arrow type is ``pa.timestamp("us")``.

        Raises
        ------
        ValueError
            If ``values`` is not a ``pa.ChunkedArray``.
        AssertionError
            If the arrow type of ``values`` is not ``pa.timestamp("us")``.
        """
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError(
                f"values must be a pyarrow.ChunkedArray, got {type(values).__name__}"
            )

        # Still an assert, so the exception type is unchanged; the message
        # only adds the offending arrow type.
        assert values.type == pa.timestamp(
            "us"
        ), f"expected timestamp[us] data, got {values.type}"
        self._data = values
        self._dtype = ArrowTimestampUSDtype()
|
||||
|
||||
|
||||
def test_constructor_extensionblock():
    """Smoke test: a DataFrame can be built around an Arrow timestamp array."""
    # GH 34986
    stamp = datetime.datetime(2010, 9, 8, 7, 6, 5, 4)
    column = ArrowTimestampUSArray.from_scalars([None, stamp])
    pd.DataFrame({"timestamp": column})
|
||||
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
Base test suite for extension arrays.
|
||||
|
||||
These tests are intended for third-party libraries to subclass to validate
|
||||
that their extension arrays and dtypes satisfy the interface. Moving or
|
||||
renaming the tests should not be done lightly.
|
||||
|
||||
Libraries are expected to implement a few pytest fixtures to provide data
|
||||
for the tests. The fixtures may be located in either
|
||||
|
||||
* The same module as your test class.
|
||||
* A ``conftest.py`` in the same directory as your test class.
|
||||
|
||||
The full list of fixtures may be found in the ``conftest.py`` next to this
|
||||
file.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import pytest
|
||||
from pandas.tests.extension.base import BaseDtypeTests
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dtype():
|
||||
return MyDtype()
|
||||
|
||||
|
||||
class TestMyDtype(BaseDtypeTests):
|
||||
pass
|
||||
|
||||
|
||||
Your class ``TestMyDtype`` will inherit all the tests defined on
|
||||
``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
|
||||
wherever the test requires it. You're free to implement additional tests.
|
||||
|
||||
All the tests in these modules use ``self.assert_frame_equal`` or
|
||||
``self.assert_series_equal`` for dataframe or series comparisons. By default,
|
||||
they use the usual ``pandas.testing.assert_frame_equal`` and
|
||||
``pandas.testing.assert_series_equal``. You can override the checks used
|
||||
by defining the staticmethods ``assert_frame_equal`` and
|
||||
``assert_series_equal`` on your base test class.
|
||||
|
||||
"""
|
||||
from pandas.tests.extension.base.casting import BaseCastingTests # noqa
|
||||
from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa
|
||||
from pandas.tests.extension.base.dim2 import ( # noqa
|
||||
Dim2CompatTests,
|
||||
NDArrayBacked2DTests,
|
||||
)
|
||||
from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa
|
||||
from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa
|
||||
from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa
|
||||
from pandas.tests.extension.base.index import BaseIndexTests # noqa
|
||||
from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa
|
||||
from pandas.tests.extension.base.io import BaseParsingTests # noqa
|
||||
from pandas.tests.extension.base.methods import BaseMethodsTests # noqa
|
||||
from pandas.tests.extension.base.missing import BaseMissingTests # noqa
|
||||
from pandas.tests.extension.base.ops import ( # noqa
|
||||
BaseArithmeticOpsTests,
|
||||
BaseComparisonOpsTests,
|
||||
BaseOpsUtil,
|
||||
BaseUnaryOpsTests,
|
||||
)
|
||||
from pandas.tests.extension.base.printing import BasePrintingTests # noqa
|
||||
from pandas.tests.extension.base.reduce import ( # noqa
|
||||
BaseBooleanReduceTests,
|
||||
BaseNoReduceTests,
|
||||
BaseNumericReduceTests,
|
||||
)
|
||||
from pandas.tests.extension.base.reshaping import BaseReshapingTests # noqa
|
||||
from pandas.tests.extension.base.setitem import BaseSetitemTests # noqa
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class BaseExtensionTests:
    """Common base exposing the equality assertions as overridable hooks."""

    # classmethod and different signature is needed
    # to make inheritance compliant with mypy
    @classmethod
    def assert_equal(cls, left, right, **kwargs):
        """Forward to ``tm.assert_equal``."""
        checker = tm.assert_equal
        return checker(left, right, **kwargs)

    @classmethod
    def assert_series_equal(cls, left, right, *args, **kwargs):
        """Forward to ``tm.assert_series_equal``."""
        checker = tm.assert_series_equal
        return checker(left, right, *args, **kwargs)

    @classmethod
    def assert_frame_equal(cls, left, right, *args, **kwargs):
        """Forward to ``tm.assert_frame_equal``."""
        checker = tm.assert_frame_equal
        return checker(left, right, *args, **kwargs)

    @classmethod
    def assert_extension_array_equal(cls, left, right, *args, **kwargs):
        """Forward to ``tm.assert_extension_array_equal``."""
        checker = tm.assert_extension_array_equal
        return checker(left, right, *args, **kwargs)
|
||||
@@ -0,0 +1,87 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import np_version_under1p20
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.internals import ObjectBlock
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseCastingTests(BaseExtensionTests):
    """Casting to and from ExtensionDtypes"""

    def test_astype_object_series(self, all_data):
        """Casting a Series to ``object`` stores an object-dtype ndarray."""
        ser = pd.Series(all_data, name="A")
        result = ser.astype(object)
        assert result.dtype == np.dtype(object)
        # Managers without a ``blocks`` attribute skip the block-type check.
        if hasattr(result._mgr, "blocks"):
            assert isinstance(result._mgr.blocks[0], ObjectBlock)
        assert isinstance(result._mgr.array, np.ndarray)
        assert result._mgr.array.dtype == np.dtype(object)

    def test_astype_object_frame(self, all_data):
        """Casting a DataFrame to ``object`` stores an object-dtype ndarray."""
        df = pd.DataFrame({"A": all_data})

        result = df.astype(object)
        if hasattr(result._mgr, "blocks"):
            # Read the manager through ``_mgr``, matching the ``hasattr``
            # guard above and every other access in this class.
            blk = result._mgr.blocks[0]
            assert isinstance(blk, ObjectBlock), type(blk)
        assert isinstance(result._mgr.arrays[0], np.ndarray)
        assert result._mgr.arrays[0].dtype == np.dtype(object)

        # earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64"
        # instead of returning False
        if not np_version_under1p20:
            # check that we can compare the dtypes
            comp = result.dtypes == df.dtypes
            assert not comp.any()

    def test_tolist(self, data):
        """``Series.tolist`` matches ``list`` applied to the array."""
        result = pd.Series(data).tolist()
        expected = list(data)
        assert result == expected

    def test_astype_str(self, data):
        """``astype(str)`` matches applying ``str`` to each element."""
        result = pd.Series(data[:5]).astype(str)
        expected = pd.Series([str(x) for x in data[:5]], dtype=str)
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "nullable_string_dtype",
        [
            "string[python]",
            pytest.param(
                "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
            ),
        ],
    )
    def test_astype_string(self, data, nullable_string_dtype):
        """Casting to a nullable string dtype matches per-element ``str``."""
        # GH-33465
        result = pd.Series(data[:5]).astype(nullable_string_dtype)
        expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
        self.assert_series_equal(result, expected)

    def test_to_numpy(self, data):
        """``to_numpy`` on the array and on a Series match ``np.asarray``."""
        expected = np.asarray(data)

        result = data.to_numpy()
        self.assert_equal(result, expected)

        result = pd.Series(data).to_numpy()
        self.assert_equal(result, expected)

    def test_astype_empty_dataframe(self, dtype):
        """``astype`` on an empty DataFrame returns an equal frame."""
        # https://github.com/pandas-dev/pandas/issues/33113
        df = pd.DataFrame()
        result = df.astype(dtype)
        self.assert_frame_equal(result, df)

    @pytest.mark.parametrize("copy", [True, False])
    def test_astype_own_type(self, data, copy):
        """Casting to the array's own dtype honours the ``copy`` flag."""
        # ensure that astype returns the original object for equal dtype and copy=False
        # https://github.com/pandas-dev/pandas/issues/28488
        result = data.astype(data.dtype, copy=copy)
        assert (result is data) is (not copy)
        self.assert_extension_array_equal(result, data)
|
||||
@@ -0,0 +1,142 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import ExtensionArray
|
||||
from pandas.core.internals.blocks import EABackedBlock
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseConstructorsTests(BaseExtensionTests):
    """Tests for building arrays, Series and DataFrames from extension data."""

    def test_from_sequence_from_cls(self, data):
        """``_from_sequence`` round-trips the full array and an empty slice."""

        def check_round_trip(arr):
            rebuilt = type(arr)._from_sequence(arr, dtype=arr.dtype)
            self.assert_extension_array_equal(rebuilt, arr)

        check_round_trip(data)
        check_round_trip(data[:0])

    def test_array_from_scalars(self, data):
        """``_from_sequence`` on a list of scalars returns the array type."""
        scalars = [data[pos] for pos in range(3)]
        built = data._from_sequence(scalars)
        assert isinstance(built, type(data))

    def test_series_constructor(self, data):
        """A Series keeps the dtype, the length and the array object itself."""
        ser = pd.Series(data)
        assert ser.dtype == data.dtype
        assert len(ser) == len(data)
        if hasattr(ser._mgr, "blocks"):
            assert isinstance(ser._mgr.blocks[0], EABackedBlock)
        assert ser._mgr.array is data

        # Series[EA] is unboxed / boxed correctly
        reboxed = pd.Series(ser)
        assert reboxed.dtype == data.dtype
        if hasattr(ser._mgr, "blocks"):
            assert isinstance(reboxed._mgr.blocks[0], EABackedBlock)

    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        """With only an index and a dtype, the Series is filled with NA."""
        labels = [1, 2, 3]
        actual = pd.Series(index=labels, dtype=dtype)
        wanted = pd.Series([na_value] * 3, index=labels, dtype=dtype)
        self.assert_series_equal(actual, wanted)

        # GH 33559 - empty index
        actual = pd.Series(index=[], dtype=dtype)
        wanted = pd.Series([], index=pd.Index([], dtype="object"), dtype=dtype)
        self.assert_series_equal(actual, wanted)

    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        """A scalar NA is broadcast over the supplied index."""
        labels = [1, 2, 3]
        actual = pd.Series(na_value, index=labels, dtype=dtype)
        wanted = pd.Series([na_value] * 3, index=labels, dtype=dtype)
        self.assert_series_equal(actual, wanted)

    def test_series_constructor_scalar_with_index(self, data, dtype):
        """A scalar element is broadcast over indexes of length 3 and 1."""
        scalar = data[0]
        for labels in ([1, 2, 3], ["foo"]):
            actual = pd.Series(scalar, index=labels, dtype=dtype)
            wanted = pd.Series([scalar] * len(labels), index=labels, dtype=dtype)
            self.assert_series_equal(actual, wanted)

    @pytest.mark.parametrize("from_series", [True, False])
    def test_dataframe_constructor_from_dict(self, data, from_series):
        """A dict of array (or Series) values keeps the extension dtype."""
        if from_series:
            data = pd.Series(data)
        frame = pd.DataFrame({"A": data})

        assert frame.dtypes["A"] == data.dtype
        assert frame.shape == (len(data), 1)
        manager = frame._mgr
        if hasattr(manager, "blocks"):
            assert isinstance(manager.blocks[0], EABackedBlock)
        assert isinstance(manager.arrays[0], ExtensionArray)

    def test_dataframe_from_series(self, data):
        """A DataFrame built from a Series keeps the extension dtype."""
        frame = pd.DataFrame(pd.Series(data))

        assert frame.dtypes[0] == data.dtype
        assert frame.shape == (len(data), 1)
        manager = frame._mgr
        if hasattr(manager, "blocks"):
            assert isinstance(manager.blocks[0], EABackedBlock)
        assert isinstance(manager.arrays[0], ExtensionArray)

    def test_series_given_mismatched_index_raises(self, data):
        """Three values with a five-element index raise ``ValueError``."""
        msg = r"Length of values \(3\) does not match length of index \(5\)"
        too_long_index = [0, 1, 2, 3, 4]
        with pytest.raises(ValueError, match=msg):
            pd.Series(data[:3], index=too_long_index)

    def test_from_dtype(self, data):
        """Construction accepts the dtype object and its string form."""
        # construct from our dtype & string dtype
        dtype = data.dtype

        wanted_ser = pd.Series(data)
        from_obj = pd.Series(list(data), dtype=dtype)
        self.assert_series_equal(from_obj, wanted_ser)

        from_str = pd.Series(list(data), dtype=str(dtype))
        self.assert_series_equal(from_str, wanted_ser)

        # gh-30280

        wanted_frame = pd.DataFrame(data).astype(dtype)
        frame_from_obj = pd.DataFrame(list(data), dtype=dtype)
        self.assert_frame_equal(frame_from_obj, wanted_frame)

        frame_from_str = pd.DataFrame(list(data), dtype=str(dtype))
        self.assert_frame_equal(frame_from_str, wanted_frame)

    def test_pandas_array(self, data):
        """``pd.array`` on an extension array returns an equal array."""
        # pd.array(extension_array) should be idempotent...
        self.assert_extension_array_equal(pd.array(data), data)

    def test_pandas_array_dtype(self, data):
        """``pd.array`` with an explicit object dtype gives a ``PandasArray``."""
        # ... but specifying dtype will override idempotency
        object_dtype = np.dtype(object)
        actual = pd.array(data, dtype=object_dtype)
        wanted = pd.arrays.PandasArray(np.asarray(data, dtype=object))
        self.assert_equal(actual, wanted)

    def test_construct_empty_dataframe(self, dtype):
        """An empty DataFrame with a dtype matches one built from ``pd.array``."""
        # GH 33623
        actual = pd.DataFrame(columns=["a"], dtype=dtype)
        wanted = pd.DataFrame(
            {"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object")
        )
        self.assert_frame_equal(actual, wanted)

    def test_empty(self, dtype):
        """``cls._empty`` and ``dtype.empty`` return arrays of shape ``(4,)``."""
        cls = dtype.construct_array_type()

        def check(arr):
            assert isinstance(arr, cls)
            assert arr.dtype == dtype
            assert arr.shape == (4,)

        check(cls._empty((4,), dtype=dtype))

        # GH#19600 method on ExtensionDtype
        check(dtype.empty((4,)))

        check(dtype.empty(4))
|
||||
@@ -0,0 +1,301 @@
|
||||
"""
|
||||
Tests for 2D compatibility.
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.missing import is_matching_na
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.arrays.integer import INT_STR_TO_DTYPE
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class Dim2CompatTests(BaseExtensionTests):
|
||||
def test_transpose(self, data):
|
||||
arr2d = data.repeat(2).reshape(-1, 2)
|
||||
shape = arr2d.shape
|
||||
assert shape[0] != shape[-1] # otherwise the rest of the test is useless
|
||||
|
||||
assert arr2d.T.shape == shape[::-1]
|
||||
|
||||
def test_frame_from_2d_array(self, data):
|
||||
arr2d = data.repeat(2).reshape(-1, 2)
|
||||
|
||||
df = pd.DataFrame(arr2d)
|
||||
expected = pd.DataFrame({0: arr2d[:, 0], 1: arr2d[:, 1]})
|
||||
self.assert_frame_equal(df, expected)
|
||||
|
||||
def test_swapaxes(self, data):
|
||||
arr2d = data.repeat(2).reshape(-1, 2)
|
||||
|
||||
result = arr2d.swapaxes(0, 1)
|
||||
expected = arr2d.T
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_delete_2d(self, data):
|
||||
arr2d = data.repeat(3).reshape(-1, 3)
|
||||
|
||||
# axis = 0
|
||||
result = arr2d.delete(1, axis=0)
|
||||
expected = data.delete(1).repeat(3).reshape(-1, 3)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
result = arr2d.delete(1, axis=1)
|
||||
expected = data.repeat(2).reshape(-1, 2)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_take_2d(self, data):
|
||||
arr2d = data.reshape(-1, 1)
|
||||
|
||||
result = arr2d.take([0, 0, -1], axis=0)
|
||||
|
||||
expected = data.take([0, 0, -1]).reshape(-1, 1)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_repr_2d(self, data):
|
||||
# this could fail in a corner case where an element contained the name
|
||||
res = repr(data.reshape(1, -1))
|
||||
assert res.count(f"<{type(data).__name__}") == 1
|
||||
|
||||
res = repr(data.reshape(-1, 1))
|
||||
assert res.count(f"<{type(data).__name__}") == 1
|
||||
|
||||
def test_reshape(self, data):
|
||||
arr2d = data.reshape(-1, 1)
|
||||
assert arr2d.shape == (data.size, 1)
|
||||
assert len(arr2d) == len(data)
|
||||
|
||||
arr2d = data.reshape((-1, 1))
|
||||
assert arr2d.shape == (data.size, 1)
|
||||
assert len(arr2d) == len(data)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
data.reshape((data.size, 2))
|
||||
with pytest.raises(ValueError):
|
||||
data.reshape(data.size, 2)
|
||||
|
||||
def test_getitem_2d(self, data):
|
||||
arr2d = data.reshape(1, -1)
|
||||
|
||||
result = arr2d[0]
|
||||
self.assert_extension_array_equal(result, data)
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
arr2d[1]
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
arr2d[-2]
|
||||
|
||||
result = arr2d[:]
|
||||
self.assert_extension_array_equal(result, arr2d)
|
||||
|
||||
result = arr2d[:, :]
|
||||
self.assert_extension_array_equal(result, arr2d)
|
||||
|
||||
result = arr2d[:, 0]
|
||||
expected = data[[0]]
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
# dimension-expanding getitem on 1D
|
||||
result = data[:, np.newaxis]
|
||||
self.assert_extension_array_equal(result, arr2d.T)
|
||||
|
||||
def test_iter_2d(self, data):
|
||||
arr2d = data.reshape(1, -1)
|
||||
|
||||
objs = list(iter(arr2d))
|
||||
assert len(objs) == arr2d.shape[0]
|
||||
|
||||
for obj in objs:
|
||||
assert isinstance(obj, type(data))
|
||||
assert obj.dtype == data.dtype
|
||||
assert obj.ndim == 1
|
||||
assert len(obj) == arr2d.shape[1]
|
||||
|
||||
def test_tolist_2d(self, data):
|
||||
arr2d = data.reshape(1, -1)
|
||||
|
||||
result = arr2d.tolist()
|
||||
expected = [data.tolist()]
|
||||
|
||||
assert isinstance(result, list)
|
||||
assert all(isinstance(x, list) for x in result)
|
||||
|
||||
assert result == expected
|
||||
|
||||
def test_concat_2d(self, data):
|
||||
left = type(data)._concat_same_type([data, data]).reshape(-1, 2)
|
||||
right = left.copy()
|
||||
|
||||
# axis=0
|
||||
result = left._concat_same_type([left, right], axis=0)
|
||||
expected = data._concat_same_type([data] * 4).reshape(-1, 2)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
# axis=1
|
||||
result = left._concat_same_type([left, right], axis=1)
|
||||
assert result.shape == (len(data), 4)
|
||||
self.assert_extension_array_equal(result[:, :2], left)
|
||||
self.assert_extension_array_equal(result[:, 2:], right)
|
||||
|
||||
# axis > 1 -> invalid
|
||||
msg = "axis 2 is out of bounds for array of dimension 2"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
left._concat_same_type([left, right], axis=2)
|
||||
|
||||
@pytest.mark.parametrize("method", ["backfill", "pad"])
|
||||
def test_fillna_2d_method(self, data_missing, method):
|
||||
arr = data_missing.repeat(2).reshape(2, 2)
|
||||
assert arr[0].isna().all()
|
||||
assert not arr[1].isna().any()
|
||||
|
||||
result = arr.fillna(method=method)
|
||||
|
||||
expected = data_missing.fillna(method=method).repeat(2).reshape(2, 2)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
def test_reductions_2d_axis_none(self, data, method):
    """Reducing a 2D array with ``axis=None`` agrees with the 1D reduction.

    If the 1D reduction raises, the 2D reduction must raise the same
    exception type.
    """
    arr2d = data.reshape(1, -1)

    exc_1d = None
    exc_2d = None
    try:
        expected = getattr(data, method)()
    except Exception as err:
        # if the 1D reduction is invalid, the 2D reduction should be as well
        exc_1d = err
        try:
            result = getattr(arr2d, method)(axis=None)
        except Exception as err2:
            exc_2d = err2
    else:
        result = getattr(arr2d, method)(axis=None)

    if not (exc_2d is None and exc_1d is None):
        assert type(exc_2d) == type(exc_1d)
        return

    assert is_matching_na(result, expected) or result == expected
|
||||
|
||||
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
def test_reductions_2d_axis0(self, data, method):
    """Reducing a (1, N) array along axis 0 reproduces the 1D data.

    When the 2D reduction raises, the 1D reduction must raise the same
    exception type. ``var`` results are not checked.
    """
    arr2d = data.reshape(1, -1)

    # pass ddof=0 so we get all-zero std instead of all-NA std
    kwargs = {"ddof": 0} if method == "std" else {}

    try:
        result = getattr(arr2d, method)(axis=0, **kwargs)
    except Exception as err:
        try:
            getattr(data, method)()
        except Exception as err2:
            assert type(err) == type(err2)
            return
        else:
            raise AssertionError("Both reductions should raise or neither")

    def get_reduction_result_dtype(dtype):
        # windows and 32bit builds will in some cases have int32/uint32
        # where other builds will have int64/uint64.
        if dtype.itemsize == 8:
            return dtype
        if dtype.kind in "ib":
            return INT_STR_TO_DTYPE[np.dtype(int).name]
        # i.e. dtype.kind == "u"
        return INT_STR_TO_DTYPE[np.dtype(np.uint).name]

    if method in ["mean", "median", "sum", "prod"]:
        # std and var are not dtype-preserving
        expected = data
        if method in ["sum", "prod"] and data.dtype.kind in "iub":
            dtype = get_reduction_result_dtype(data.dtype)

            expected = data.astype(dtype)
            # For boolean input we get IntegerArray instead of BooleanArray,
            # so the array-type check only applies to the other kinds.
            if not (data.dtype.kind == "b" and method in ["sum", "prod"]):
                assert type(expected) == type(data), type(expected)
            assert dtype == expected.dtype

        self.assert_extension_array_equal(result, expected)
    elif method == "std":
        self.assert_extension_array_equal(result, data - data)
    # punt on method == "var"
|
||||
|
||||
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
def test_reductions_2d_axis1(self, data, method):
    """Reducing a (1, N) array along axis 1 gives one value matching the 1D result."""
    arr2d = data.reshape(1, -1)
    reduce_2d = getattr(arr2d, method)

    try:
        result = reduce_2d(axis=1)
    except Exception as err:
        # the 1D reduction has to fail with the same exception type
        try:
            getattr(data, method)()
        except Exception as err2:
            assert type(err) == type(err2)
            return
        else:
            raise AssertionError("Both reductions should raise or neither")

    # not necessarily type/dtype-preserving, so weaker assertions
    assert result.shape == (1,)
    expected_scalar = getattr(data, method)()
    value = result[0]
    assert is_matching_na(value, expected_scalar) or value == expected_scalar
|
||||
|
||||
|
||||
class NDArrayBacked2DTests(Dim2CompatTests):
|
||||
# More specific tests for NDArrayBackedExtensionArray subclasses
|
||||
|
||||
def test_copy_order(self, data):
    """``copy`` follows numpy semantics for the ``order`` keyword."""

    def is_c(arr):
        return arr._ndarray.flags["C_CONTIGUOUS"]

    def is_f(arr):
        return arr._ndarray.flags["F_CONTIGUOUS"]

    arr2d = data.repeat(2).reshape(-1, 2)
    assert is_c(arr2d)

    # default order gives a C-contiguous copy, also for a strided view
    assert is_c(arr2d.copy())
    assert is_c(arr2d[::2, ::2].copy())

    fortran = arr2d.copy("F")
    assert not is_c(fortran)
    assert is_f(fortran)

    # "K" keeps the layout of the source
    assert is_c(arr2d.copy("K"))

    kept_t = arr2d.T.copy("K")
    assert not is_c(kept_t)
    assert is_f(kept_t)

    # order not accepted by numpy
    msg = r"order must be one of 'C', 'F', 'A', or 'K' \(got 'Q'\)"
    with pytest.raises(ValueError, match=msg):
        arr2d.copy("Q")

    # neither contiguity
    arr_nc = arr2d[::2]
    assert not is_c(arr_nc)
    assert not is_f(arr_nc)

    assert is_c(arr_nc.copy())
    assert not is_f(arr_nc.copy())

    assert is_c(arr_nc.copy("C"))
    assert not is_f(arr_nc.copy("C"))

    assert not is_c(arr_nc.copy("F"))
    assert is_f(arr_nc.copy("F"))

    assert is_c(arr_nc.copy("K"))
    assert not is_f(arr_nc.copy("K"))
|
||||
@@ -0,0 +1,137 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.types import (
|
||||
infer_dtype,
|
||||
is_object_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseDtypeTests(BaseExtensionTests):
|
||||
"""Base class for ExtensionDtype classes"""
|
||||
|
||||
def test_name(self, dtype):
|
||||
assert isinstance(dtype.name, str)
|
||||
|
||||
def test_kind(self, dtype):
|
||||
valid = set("biufcmMOSUV")
|
||||
assert dtype.kind in valid
|
||||
|
||||
def test_construct_from_string_own_name(self, dtype):
|
||||
result = dtype.construct_from_string(dtype.name)
|
||||
assert type(result) is type(dtype)
|
||||
|
||||
# check OK as classmethod
|
||||
result = type(dtype).construct_from_string(dtype.name)
|
||||
assert type(result) is type(dtype)
|
||||
|
||||
def test_is_dtype_from_name(self, dtype):
|
||||
result = type(dtype).is_dtype(dtype.name)
|
||||
assert result is True
|
||||
|
||||
def test_is_dtype_unboxes_dtype(self, data, dtype):
|
||||
assert dtype.is_dtype(data) is True
|
||||
|
||||
def test_is_dtype_from_self(self, dtype):
|
||||
result = type(dtype).is_dtype(dtype)
|
||||
assert result is True
|
||||
|
||||
def test_is_dtype_other_input(self, dtype):
|
||||
assert dtype.is_dtype([1, 2, 3]) is False
|
||||
|
||||
def test_is_not_string_type(self, dtype):
|
||||
return not is_string_dtype(dtype)
|
||||
|
||||
def test_is_not_object_type(self, dtype):
|
||||
return not is_object_dtype(dtype)
|
||||
|
||||
def test_eq_with_str(self, dtype):
|
||||
assert dtype == dtype.name
|
||||
assert dtype != dtype.name + "-suffix"
|
||||
|
||||
def test_eq_with_numpy_object(self, dtype):
|
||||
assert dtype != np.dtype("object")
|
||||
|
||||
def test_eq_with_self(self, dtype):
|
||||
assert dtype == dtype
|
||||
assert dtype != object()
|
||||
|
||||
def test_array_type(self, data, dtype):
|
||||
assert dtype.construct_array_type() is type(data)
|
||||
|
||||
def test_check_dtype(self, data):
|
||||
dtype = data.dtype
|
||||
|
||||
# check equivalency for using .dtypes
|
||||
df = pd.DataFrame(
|
||||
{"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1}
|
||||
)
|
||||
|
||||
# TODO(numpy-1.20): This warnings filter and if block can be removed
|
||||
# once we require numpy>=1.20
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
result = df.dtypes == str(dtype)
|
||||
# NumPy>=1.20.0, but not pandas.compat.numpy till there
|
||||
# is a wheel available with this change.
|
||||
try:
|
||||
new_numpy_behavior = np.dtype("int64") != "Int64"
|
||||
except TypeError:
|
||||
new_numpy_behavior = True
|
||||
|
||||
if dtype.name == "Int64" and not new_numpy_behavior:
|
||||
expected = pd.Series([True, True, False, True], index=list("ABCD"))
|
||||
else:
|
||||
expected = pd.Series([True, True, False, False], index=list("ABCD"))
|
||||
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
expected = pd.Series([True, True, False, False], index=list("ABCD"))
|
||||
result = df.dtypes.apply(str) == str(dtype)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_hashable(self, dtype):
|
||||
hash(dtype) # no error
|
||||
|
||||
def test_str(self, dtype):
|
||||
assert str(dtype) == dtype.name
|
||||
|
||||
def test_eq(self, dtype):
|
||||
assert dtype == dtype.name
|
||||
assert dtype != "anonther_type"
|
||||
|
||||
def test_construct_from_string(self, dtype):
|
||||
dtype_instance = type(dtype).construct_from_string(dtype.name)
|
||||
assert isinstance(dtype_instance, type(dtype))
|
||||
|
||||
def test_construct_from_string_another_type_raises(self, dtype):
    """An unrelated name string makes ``construct_from_string`` raise ``TypeError``."""
    dtype_cls = type(dtype)
    msg = f"Cannot construct a '{dtype_cls.__name__}' from 'another_type'"
    with pytest.raises(TypeError, match=msg):
        dtype_cls.construct_from_string("another_type")
|
||||
|
||||
def test_construct_from_string_wrong_type_raises(self, dtype):
    """A non-string argument makes ``construct_from_string`` raise ``TypeError``."""
    msg = "'construct_from_string' expects a string, got <class 'int'>"
    with pytest.raises(TypeError, match=msg):
        type(dtype).construct_from_string(0)
|
||||
|
||||
def test_get_common_dtype(self, dtype):
|
||||
# in practice we will not typically call this with a 1-length list
|
||||
# (we shortcut to just use that dtype as the common dtype), but
|
||||
# still testing as good practice to have this working (and it is the
|
||||
# only case we can test in general)
|
||||
assert dtype._get_common_dtype([dtype]) == dtype
|
||||
|
||||
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype(self, data, data_missing, skipna):
    """``infer_dtype`` returns a string for arrays with and without missing values."""
    # only testing that this works without raising an error
    inferred = infer_dtype(data, skipna=skipna)
    assert isinstance(inferred, str)

    inferred = infer_dtype(data_missing, skipna=skipna)
    assert isinstance(inferred, str)
|
||||
@@ -0,0 +1,486 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseGetitemTests(BaseExtensionTests):
|
||||
"""Tests for ExtensionArray.__getitem__."""
|
||||
|
||||
def test_iloc_series(self, data):
|
||||
ser = pd.Series(data)
|
||||
result = ser.iloc[:4]
|
||||
expected = pd.Series(data[:4])
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
result = ser.iloc[[0, 1, 2, 3]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_iloc_frame(self, data):
|
||||
df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
|
||||
expected = pd.DataFrame({"A": data[:4]})
|
||||
|
||||
# slice -> frame
|
||||
result = df.iloc[:4, [0]]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
# sequence -> frame
|
||||
result = df.iloc[[0, 1, 2, 3], [0]]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
expected = pd.Series(data[:4], name="A")
|
||||
|
||||
# slice -> series
|
||||
result = df.iloc[:4, 0]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# sequence -> series
|
||||
result = df.iloc[:4, 0]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# GH#32959 slice columns with step
|
||||
result = df.iloc[:, ::2]
|
||||
self.assert_frame_equal(result, df[["A"]])
|
||||
result = df[["B", "A"]].iloc[:, ::2]
|
||||
self.assert_frame_equal(result, df[["B"]])
|
||||
|
||||
def test_iloc_frame_single_block(self, data):
|
||||
# GH#32959 null slice along index, slice along columns with single-block
|
||||
df = pd.DataFrame({"A": data})
|
||||
|
||||
result = df.iloc[:, :]
|
||||
self.assert_frame_equal(result, df)
|
||||
|
||||
result = df.iloc[:, :1]
|
||||
self.assert_frame_equal(result, df)
|
||||
|
||||
result = df.iloc[:, :2]
|
||||
self.assert_frame_equal(result, df)
|
||||
|
||||
result = df.iloc[:, ::2]
|
||||
self.assert_frame_equal(result, df)
|
||||
|
||||
result = df.iloc[:, 1:2]
|
||||
self.assert_frame_equal(result, df.iloc[:, :0])
|
||||
|
||||
result = df.iloc[:, -1:]
|
||||
self.assert_frame_equal(result, df)
|
||||
|
||||
def test_loc_series(self, data):
|
||||
ser = pd.Series(data)
|
||||
result = ser.loc[:3]
|
||||
expected = pd.Series(data[:4])
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
result = ser.loc[[0, 1, 2, 3]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_loc_frame(self, data):
|
||||
df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
|
||||
expected = pd.DataFrame({"A": data[:4]})
|
||||
|
||||
# slice -> frame
|
||||
result = df.loc[:3, ["A"]]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
# sequence -> frame
|
||||
result = df.loc[[0, 1, 2, 3], ["A"]]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
expected = pd.Series(data[:4], name="A")
|
||||
|
||||
# slice -> series
|
||||
result = df.loc[:3, "A"]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# sequence -> series
|
||||
result = df.loc[:3, "A"]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_loc_iloc_frame_single_dtype(self, data):
|
||||
# GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
|
||||
# return a scalar
|
||||
df = pd.DataFrame({"A": data})
|
||||
expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
|
||||
|
||||
result = df.loc[2]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
expected = pd.Series(
|
||||
[data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
|
||||
)
|
||||
result = df.iloc[-1]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_getitem_scalar(self, data):
|
||||
result = data[0]
|
||||
assert isinstance(result, data.dtype.type)
|
||||
|
||||
result = pd.Series(data)[0]
|
||||
assert isinstance(result, data.dtype.type)
|
||||
|
||||
def test_getitem_invalid(self, data):
    """Invalid keys and out-of-range positions raise ``IndexError``."""
    # TODO: box over scalar, [scalar], (scalar,)?

    bad_key_msg = (
        r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
        r"\(`None`\) and integer or boolean arrays are valid indices"
    )
    with pytest.raises(IndexError, match=bad_key_msg):
        data["foo"]
    with pytest.raises(IndexError, match=bad_key_msg):
        data[2.5]

    ub = len(data)
    # The wording differs between array implementations, so any of these
    # alternatives is accepted.
    alternatives = [
        "list index out of range",  # json
        "index out of bounds",  # pyarrow
        "Out of bounds access",  # Sparse
        f"loc must be an integer between -{ub} and {ub}",  # Sparse
        f"index {ub+1} is out of bounds for axis 0 with size {ub}",
        f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
    ]
    out_of_range_msg = "|".join(alternatives)
    with pytest.raises(IndexError, match=out_of_range_msg):
        data[ub + 1]
    with pytest.raises(IndexError, match=out_of_range_msg):
        data[-ub - 1]
|
||||
|
||||
def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
|
||||
result = data_missing[0]
|
||||
assert na_cmp(result, na_value)
|
||||
|
||||
def test_getitem_empty(self, data):
|
||||
# Indexing with empty list
|
||||
result = data[[]]
|
||||
assert len(result) == 0
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
expected = data[np.array([], dtype="int64")]
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_getitem_mask(self, data):
|
||||
# Empty mask, raw array
|
||||
mask = np.zeros(len(data), dtype=bool)
|
||||
result = data[mask]
|
||||
assert len(result) == 0
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
# Empty mask, in series
|
||||
mask = np.zeros(len(data), dtype=bool)
|
||||
result = pd.Series(data)[mask]
|
||||
assert len(result) == 0
|
||||
assert result.dtype == data.dtype
|
||||
|
||||
# non-empty mask, raw array
|
||||
mask[0] = True
|
||||
result = data[mask]
|
||||
assert len(result) == 1
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
# non-empty mask, in series
|
||||
result = pd.Series(data)[mask]
|
||||
assert len(result) == 1
|
||||
assert result.dtype == data.dtype
|
||||
|
||||
def test_getitem_mask_raises(self, data):
    """A boolean mask of the wrong length raises ``IndexError``."""
    msg = f"Boolean index has wrong length: 2 instead of {len(data)}"

    # numpy boolean mask
    numpy_mask = np.array([True, False])
    with pytest.raises(IndexError, match=msg):
        data[numpy_mask]

    # the same mask as a pandas "boolean" array
    boolean_mask = pd.array(numpy_mask, dtype="boolean")
    with pytest.raises(IndexError, match=msg):
        data[boolean_mask]
|
||||
|
||||
def test_getitem_boolean_array_mask(self, data):
|
||||
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
|
||||
result = data[mask]
|
||||
assert len(result) == 0
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
result = pd.Series(data)[mask]
|
||||
assert len(result) == 0
|
||||
assert result.dtype == data.dtype
|
||||
|
||||
mask[:5] = True
|
||||
expected = data.take([0, 1, 2, 3, 4])
|
||||
result = data[mask]
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
expected = pd.Series(expected)
|
||||
result = pd.Series(data)[mask]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_getitem_boolean_na_treated_as_false(self, data):
|
||||
# https://github.com/pandas-dev/pandas/issues/31503
|
||||
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
|
||||
mask[:2] = pd.NA
|
||||
mask[2:4] = True
|
||||
|
||||
result = data[mask]
|
||||
expected = data[mask.fillna(False)]
|
||||
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
s = pd.Series(data)
|
||||
|
||||
result = s[mask]
|
||||
expected = s[mask.fillna(False)]
|
||||
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "idx",
    [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
    ids=["list", "integer-array", "numpy-array"],
)
def test_getitem_integer_array(self, data, idx):
    """Integer indexers of several container types behave like ``take``."""
    picked = data[idx]
    assert len(picked) == 3
    assert isinstance(picked, type(data))

    taken = data.take([0, 1, 2])
    self.assert_extension_array_equal(picked, taken)

    # the same indexer applied through a Series
    expected = pd.Series(taken)
    self.assert_series_equal(pd.Series(data)[idx], expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "idx",
    [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
    ids=["list", "integer-array"],
)
def test_getitem_integer_with_missing_raises(self, data, idx):
    """An integer indexer containing NA raises ``ValueError``."""
    with pytest.raises(
        ValueError,
        match="Cannot index with an integer indexer containing NA values",
    ):
        data[idx]
|
||||
|
||||
@pytest.mark.xfail(
    reason="Tries label-based and raises KeyError; "
    "in some cases raises when calling np.asarray"
)
@pytest.mark.parametrize(
    "idx",
    [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
    ids=["list", "integer-array"],
)
def test_getitem_series_integer_with_missing_raises(self, data, idx):
    """Series indexing with an NA-containing integer indexer should raise ``ValueError``.

    Marked xfail; see the reason on the decorator.
    """
    msg = "Cannot index with an integer indexer containing NA values"
    # TODO: this raises KeyError about labels not found (it tries label-based)

    random_labels = [tm.rands(4) for _ in range(len(data))]
    ser = pd.Series(data, index=random_labels)
    with pytest.raises(ValueError, match=msg):
        ser[idx]
|
||||
|
||||
def test_getitem_slice(self, data):
|
||||
# getitem[slice] should return an array
|
||||
result = data[slice(0)] # empty
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
result = data[slice(1)] # scalar
|
||||
assert isinstance(result, type(data))
|
||||
|
||||
def test_getitem_ellipsis_and_slice(self, data):
|
||||
# GH#40353 this is called from getitem_block_index
|
||||
result = data[..., :]
|
||||
self.assert_extension_array_equal(result, data)
|
||||
|
||||
result = data[:, ...]
|
||||
self.assert_extension_array_equal(result, data)
|
||||
|
||||
result = data[..., :3]
|
||||
self.assert_extension_array_equal(result, data[:3])
|
||||
|
||||
result = data[:3, ...]
|
||||
self.assert_extension_array_equal(result, data[:3])
|
||||
|
||||
result = data[..., ::2]
|
||||
self.assert_extension_array_equal(result, data[::2])
|
||||
|
||||
result = data[::2, ...]
|
||||
self.assert_extension_array_equal(result, data[::2])
|
||||
|
||||
def test_get(self, data):
|
||||
# GH 20882
|
||||
s = pd.Series(data, index=[2 * i for i in range(len(data))])
|
||||
assert s.get(4) == s.iloc[2]
|
||||
|
||||
result = s.get([4, 6])
|
||||
expected = s.iloc[[2, 3]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
result = s.get(slice(2))
|
||||
expected = s.iloc[[0, 1]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
assert s.get(-1) is None
|
||||
assert s.get(s.index.max() + 1) is None
|
||||
|
||||
s = pd.Series(data[:6], index=list("abcdef"))
|
||||
assert s.get("c") == s.iloc[2]
|
||||
|
||||
result = s.get(slice("b", "d"))
|
||||
expected = s.iloc[[1, 2, 3]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
result = s.get("Z")
|
||||
assert result is None
|
||||
|
||||
assert s.get(4) == s.iloc[4]
|
||||
assert s.get(-1) == s.iloc[-1]
|
||||
assert s.get(len(s)) is None
|
||||
|
||||
# GH 21257
|
||||
s = pd.Series(data)
|
||||
s2 = s[::2]
|
||||
assert s2.get(1) is None
|
||||
|
||||
def test_take_sequence(self, data):
|
||||
result = pd.Series(data)[[0, 1, 3]]
|
||||
assert result.iloc[0] == data[0]
|
||||
assert result.iloc[1] == data[1]
|
||||
assert result.iloc[2] == data[3]
|
||||
|
||||
def test_take(self, data, na_value, na_cmp):
    """``take`` handles -1 as "last" without fill and as "missing" with fill."""
    # without allow_fill, -1 is the last element
    taken = data.take([0, -1])
    assert taken.dtype == data.dtype
    assert taken[0] == data[0]
    assert taken[1] == data[-1]

    # with allow_fill, -1 is filled with ``fill_value``
    filled = data.take([0, -1], allow_fill=True, fill_value=na_value)
    assert filled[0] == data[0]
    assert na_cmp(filled[1], na_value)

    too_far = len(data) + 1
    with pytest.raises(IndexError, match="out of bounds"):
        data.take([too_far])
|
||||
|
||||
def test_take_empty(self, data, na_value, na_cmp):
    """Taking from an empty array only works for fill positions."""
    empty = data[:0]

    # -1 with allow_fill produces a missing value
    filled = empty.take([-1], allow_fill=True)
    assert na_cmp(filled[0], na_value)

    # without allow_fill there is nothing to take
    with pytest.raises(
        IndexError,
        match="cannot do a non-empty take from an empty axes|out of bounds",
    ):
        empty.take([-1])

    with pytest.raises(IndexError, match="cannot do a non-empty take"):
        empty.take([0, 1])
|
||||
|
||||
def test_take_negative(self, data):
|
||||
# https://github.com/pandas-dev/pandas/issues/20640
|
||||
n = len(data)
|
||||
result = data.take([0, -n, n - 1, -1])
|
||||
expected = data.take([0, 0, n - 1, n - 1])
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_take_non_na_fill_value(self, data_missing):
|
||||
fill_value = data_missing[1] # valid
|
||||
na = data_missing[0]
|
||||
|
||||
arr = data_missing._from_sequence(
|
||||
[na, fill_value, na], dtype=data_missing.dtype
|
||||
)
|
||||
result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
|
||||
expected = arr.take([1, 1])
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_take_pandas_style_negative_raises(self, data, na_value):
    """With ``allow_fill=True``, a position below -1 raises ``ValueError``."""
    bad_positions = [0, -2]
    # the empty pattern matches any message; only the exception type is checked
    with pytest.raises(ValueError, match=""):
        data.take(bad_positions, fill_value=na_value, allow_fill=True)
|
||||
|
||||
@pytest.mark.parametrize("allow_fill", [True, False])
def test_take_out_of_bounds_raises(self, data, allow_fill):
    """A position equal to the array length raises ``IndexError``."""
    first_three = data[:3]
    positions = np.asarray([0, 3])

    with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
        first_three.take(positions, allow_fill=allow_fill)
|
||||
|
||||
def test_take_series(self, data):
|
||||
s = pd.Series(data)
|
||||
result = s.take([0, -1])
|
||||
expected = pd.Series(
|
||||
data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
|
||||
index=[0, len(data) - 1],
|
||||
)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_reindex(self, data, na_value):
|
||||
s = pd.Series(data)
|
||||
result = s.reindex([0, 1, 3])
|
||||
expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
n = len(data)
|
||||
result = s.reindex([-1, 0, n])
|
||||
expected = pd.Series(
|
||||
data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
|
||||
index=[-1, 0, n],
|
||||
)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
result = s.reindex([n, n + 1])
|
||||
expected = pd.Series(
|
||||
data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
|
||||
)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_reindex_non_na_fill_value(self, data_missing):
|
||||
valid = data_missing[1]
|
||||
na = data_missing[0]
|
||||
|
||||
arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
|
||||
ser = pd.Series(arr)
|
||||
result = ser.reindex([0, 1, 2], fill_value=valid)
|
||||
expected = pd.Series(
|
||||
data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
|
||||
)
|
||||
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_loc_len1(self, data):
|
||||
# see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
|
||||
df = pd.DataFrame({"A": data})
|
||||
res = df.loc[[0], "A"]
|
||||
assert res.ndim == 1
|
||||
assert res._mgr.arrays[0].ndim == 1
|
||||
if hasattr(res._mgr, "blocks"):
|
||||
assert res._mgr._block.ndim == 1
|
||||
|
||||
def test_item(self, data):
    """``Series.item`` works only for a Series holding exactly one element."""
    # https://github.com/pandas-dev/pandas/pull/30175
    s = pd.Series(data)
    assert s[:1].item() == data[0]

    msg = "can only convert an array of size 1 to a Python scalar"
    # empty Series
    with pytest.raises(ValueError, match=msg):
        s[:0].item()

    # full-length Series
    with pytest.raises(ValueError, match=msg):
        s.item()
|
||||
|
||||
def test_ellipsis_index(self):
|
||||
# GH42430 1D slices over extension types turn into N-dimensional slices over
|
||||
# ExtensionArrays
|
||||
class CapturingStringArray(pd.arrays.StringArray):
|
||||
"""Extend StringArray to capture arguments to __getitem__"""
|
||||
|
||||
def __getitem__(self, item):
|
||||
self.last_item_arg = item
|
||||
return super().__getitem__(item)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))}
|
||||
)
|
||||
_ = df.iloc[:1]
|
||||
|
||||
# String comparison because there's no native way to compare slices.
|
||||
# Before the fix for GH42430, last_item_arg would get set to the 2D slice
|
||||
# (Ellipsis, slice(None, 1, None))
|
||||
self.assert_equal(str(df["col1"].array.last_item_arg), "slice(None, 1, None)")
|
||||
@@ -0,0 +1,106 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseGroupbyTests(BaseExtensionTests):
|
||||
"""Groupby-specific tests."""
|
||||
|
||||
def test_grouping_grouper(self, data_for_grouping):
|
||||
df = pd.DataFrame(
|
||||
{"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping}
|
||||
)
|
||||
gr1 = df.groupby("A").grouper.groupings[0]
|
||||
gr2 = df.groupby("B").grouper.groupings[0]
|
||||
|
||||
tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
|
||||
tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
|
||||
|
||||
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
    """Grouping by an extension column and averaging gives sorted group means."""
    df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
    result = df.groupby("B", as_index=as_index).A.mean()
    _, uniques = pd.factorize(data_for_grouping, sort=True)

    if not as_index:
        # the group keys stay as a regular column
        expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
        self.assert_frame_equal(result, expected)
    else:
        # the group keys become the index
        index = pd.Index._with_infer(uniques, name="B")
        expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
        self.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_agg_extension(self, data_for_grouping):
|
||||
# GH#38980 groupby agg on extension type fails for non-numeric types
|
||||
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
|
||||
|
||||
expected = df.iloc[[0, 2, 4, 7]]
|
||||
expected = expected.set_index("A")
|
||||
|
||||
result = df.groupby("A").agg({"B": "first"})
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby("A").agg("first")
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby("A").first()
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_extension_no_sort(self, data_for_grouping):
|
||||
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
|
||||
result = df.groupby("B", sort=False).A.mean()
|
||||
_, index = pd.factorize(data_for_grouping, sort=False)
|
||||
|
||||
index = pd.Index._with_infer(index, name="B")
|
||||
expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_extension_transform(self, data_for_grouping):
|
||||
valid = data_for_grouping[~data_for_grouping.isna()]
|
||||
df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
|
||||
|
||||
result = df.groupby("B").A.transform(len)
|
||||
expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
|
||||
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
|
||||
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
|
||||
df.groupby("B").apply(groupby_apply_op)
|
||||
df.groupby("B").A.apply(groupby_apply_op)
|
||||
df.groupby("A").apply(groupby_apply_op)
|
||||
df.groupby("A").B.apply(groupby_apply_op)
|
||||
|
||||
def test_groupby_apply_identity(self, data_for_grouping):
|
||||
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
|
||||
result = df.groupby("A").B.apply(lambda x: x.array)
|
||||
expected = pd.Series(
|
||||
[
|
||||
df.B.iloc[[0, 1, 6]].array,
|
||||
df.B.iloc[[2, 3]].array,
|
||||
df.B.iloc[[4, 5]].array,
|
||||
df.B.iloc[[7]].array,
|
||||
],
|
||||
index=pd.Index([1, 2, 3, 4], name="A"),
|
||||
name="B",
|
||||
)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_in_numeric_groupby(self, data_for_grouping):
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"A": [1, 1, 2, 2, 3, 3, 1, 4],
|
||||
"B": data_for_grouping,
|
||||
"C": [1, 1, 1, 1, 1, 1, 1, 1],
|
||||
}
|
||||
)
|
||||
result = df.groupby("A").sum().columns
|
||||
|
||||
if data_for_grouping.dtype._is_numeric:
|
||||
expected = pd.Index(["B", "C"])
|
||||
else:
|
||||
expected = pd.Index(["C"])
|
||||
|
||||
tm.assert_index_equal(result, expected)
|
||||
@@ -0,0 +1,20 @@
|
||||
"""
|
||||
Tests for Indexes backed by arbitrary ExtensionArrays.
|
||||
"""
|
||||
import pandas as pd
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseIndexTests(BaseExtensionTests):
|
||||
"""Tests for Index object backed by an ExtensionArray"""
|
||||
|
||||
def test_index_from_array(self, data):
|
||||
idx = pd.Index(data)
|
||||
assert data.dtype == idx.dtype
|
||||
|
||||
def test_index_from_listlike_with_dtype(self, data):
|
||||
idx = pd.Index(data, dtype=data.dtype)
|
||||
assert idx.dtype == data.dtype
|
||||
|
||||
idx = pd.Index(list(data), dtype=data.dtype)
|
||||
assert idx.dtype == data.dtype
|
||||
@@ -0,0 +1,127 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.common import is_extension_array_dtype
|
||||
from pandas.core.dtypes.dtypes import ExtensionDtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseInterfaceTests(BaseExtensionTests):
    """Tests that the basic interface is satisfied."""

    # ------------------------------------------------------------------------
    # Interface
    # ------------------------------------------------------------------------

    def test_len(self, data):
        """The ``data`` fixture is expected to hold exactly 100 elements."""
        n_expected = 100
        assert len(data) == n_expected

    def test_size(self, data):
        """``size`` reports the same 100 elements as ``len``."""
        n_expected = 100
        assert data.size == n_expected

    def test_ndim(self, data):
        """Extension arrays under test are one-dimensional."""
        n_dims = 1
        assert data.ndim == n_dims

    def test_can_hold_na_valid(self, data):
        """``_can_hold_na`` must be the literal ``True`` (GH-20761)."""
        # GH-20761
        flag = data._can_hold_na
        assert flag is True

    def test_contains(self, data, data_missing):
        """Membership checks for regular values and for NA-like values."""
        # GH-37867
        # Tests for membership checks. Membership checks for nan-likes is tricky and
        # the settled on rule is: `nan_like in arr` is True if nan_like is
        # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.

        na_value = data.dtype.na_value
        # ensure data without missing values
        not_missing = ~data.isna()
        data = data[not_missing]

        # the first element of each array must be reported as contained
        assert data[0] in data
        assert data_missing[0] in data_missing

        # check the presence of na_value
        assert na_value in data_missing
        assert na_value not in data

        # the data can never contain other nan-likes than na_value
        for candidate in tm.NULL_OBJECTS:
            is_same_object = candidate is na_value
            # type check for e.g. two instances of Decimal("NAN")
            is_same_type = type(candidate) == type(na_value)
            if is_same_object or is_same_type:
                continue
            assert candidate not in data
            assert candidate not in data_missing

    def test_memory_usage(self, data):
        """``Series.memory_usage(index=False)`` equals ``Series.nbytes``."""
        ser = pd.Series(data)
        assert ser.memory_usage(index=False) == ser.nbytes

    def test_array_interface(self, data):
        """``np.array`` conversion works with and without ``dtype=object``."""
        converted = np.array(data)
        assert converted[0] == data[0]

        as_object = np.array(data, dtype=object)
        from_list = np.array(list(data), dtype=object)
        tm.assert_numpy_array_equal(as_object, from_list)

    def test_is_extension_array_dtype(self, data):
        """Array, dtype and wrapping Series are all recognised as extension."""
        for candidate in (data, data.dtype, pd.Series(data)):
            assert is_extension_array_dtype(candidate)
        assert isinstance(data.dtype, ExtensionDtype)

    def test_no_values_attribute(self, data):
        """The array must not expose ``values`` or ``_values``."""
        # GH-20735: EA's with .values attribute give problems with internal
        # code, disallowing this for now until solved
        for attr_name in ("values", "_values"):
            assert not hasattr(data, attr_name)

    def test_is_numeric_honored(self, data):
        """The block's ``is_numeric`` mirrors the dtype's ``_is_numeric``."""
        manager = pd.Series(data)._mgr
        if not hasattr(manager, "blocks"):
            # Nothing to check when the manager exposes no ``blocks``.
            return
        assert manager.blocks[0].is_numeric is data.dtype._is_numeric

    def test_isna_extension_array(self, data_missing):
        """If ``isna`` returns an extension array, it must support any/all."""
        # If your `isna` returns an ExtensionArray, you must also implement
        # _reduce. At the *very* least, you must implement any and all
        mask = data_missing.isna()
        if not is_extension_array_dtype(mask):
            return

        assert mask._reduce("any")
        assert mask.any()

        assert not mask._reduce("all")
        assert not mask.all()

        assert mask.dtype._is_boolean

    def test_copy(self, data):
        """Mutating the original after ``copy()`` must not change the copy."""
        # GH#27083 removing deep keyword from EA.copy
        assert data[0] != data[1]
        snapshot = data.copy()

        data[1] = data[0]
        assert snapshot[1] != snapshot[0]

    def test_view(self, data):
        """``view()`` returns a new object of the same type sharing the data."""
        # view with no dtype should return a shallow copy, *not* the same
        # object
        assert data[1] != data[0]

        alias = data.view()
        assert alias is not data
        assert type(alias) == type(data)

        # Writing through the view must be visible in the original.
        alias[1] = alias[0]
        assert data[1] == data[0]

        # check specifically that the `dtype` kwarg is accepted
        data.view(dtype=None)

    def test_tolist(self, data):
        """``tolist`` returns a real ``list`` equal to ``list(data)``."""
        as_list = data.tolist()
        assert isinstance(as_list, list)
        assert as_list == list(data)
|
||||
@@ -0,0 +1,19 @@
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseParsingTests(BaseExtensionTests):
    """CSV round-trip tests for extension dtypes."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Write the data to CSV and read it back with the dtype's string name."""
        dtype_name = str(data.dtype)
        column = pd.Series(data, dtype=dtype_name)
        frame = pd.DataFrame({"with_dtype": column})

        csv_text = frame.to_csv(index=False, na_rep=np.nan)
        buffer = StringIO(csv_text)
        result = pd.read_csv(buffer, dtype={"with_dtype": dtype_name}, engine=engine)

        # The parsed frame must match the frame that was written.
        self.assert_frame_equal(result, frame)
|
||||
@@ -0,0 +1,583 @@
|
||||
import inspect
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import is_bool_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.sorting import nargsort
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseMethodsTests(BaseExtensionTests):
    """Various Series and DataFrame methods."""

    def test_value_counts_default_dropna(self, data):
        """``value_counts``, when implemented, must default to ``dropna=True``."""
        # make sure we have consistent default dropna kwarg
        if not hasattr(data, "value_counts"):
            pytest.skip("value_counts is not implemented")
        sig = inspect.signature(data.value_counts)
        kwarg = sig.parameters["dropna"]
        assert kwarg.default is True

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """Series.value_counts on the array matches value_counts on a reference."""
        all_data = all_data[:10]
        if dropna:
            # Reference: the non-missing values converted to a NumPy array.
            other = np.array(all_data[~all_data.isna()])
        else:
            other = all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

        self.assert_series_equal(result, expected)

    def test_value_counts_with_normalize(self, data):
        """With unique input, normalized counts are ``1 / n`` per value."""
        # GH 33172
        data = data[:10].unique()
        values = np.array(data[~data.isna()])
        ser = pd.Series(data, dtype=data.dtype)

        result = ser.value_counts(normalize=True).sort_index()

        if not isinstance(data, pd.Categorical):
            expected = pd.Series([1 / len(values)] * len(values), index=result.index)
        else:
            # For Categorical only the entries with a positive count get 1 / n.
            expected = pd.Series(0.0, index=result.index)
            expected[result > 0] = 1 / len(values)

        self.assert_series_equal(result, expected)

    def test_count(self, data_missing):
        """Row-wise ``DataFrame.count`` yields 0 then 1 for ``data_missing``."""
        df = pd.DataFrame({"A": data_missing})
        result = df.count(axis="columns")
        expected = pd.Series([0, 1])
        self.assert_series_equal(result, expected)

    def test_series_count(self, data_missing):
        """``Series.count`` reports a single non-missing value."""
        # GH#26835
        ser = pd.Series(data_missing)
        result = ser.count()
        expected = 1
        assert result == expected

    def test_apply_simple_series(self, data):
        """``Series.apply`` with a plain callable returns a Series."""
        result = pd.Series(data).apply(id)
        assert isinstance(result, pd.Series)

    def test_argsort(self, data_for_sorting):
        """``Series.argsort`` returns intp positions ``[2, 0, 1]``."""
        result = pd.Series(data_for_sorting).argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
        self.assert_series_equal(result, expected)

    def test_argsort_missing_array(self, data_missing_for_sorting):
        """Array-level ``argsort`` with a missing value returns ``[2, 0, 1]``."""
        result = data_missing_for_sorting.argsort()
        # argsort result gets passed to take, so should be np.intp
        expected = np.array([2, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_argsort_missing(self, data_missing_for_sorting):
        """``Series.argsort`` marks the missing position with ``-1``."""
        result = pd.Series(data_missing_for_sorting).argsort()
        expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
        self.assert_series_equal(result, expected)

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
        """``argmin``/``argmax`` positions, with repeats and with a missing value."""
        # GH 24382

        # data_for_sorting -> [B, C, A] with A < B < C
        assert data_for_sorting.argmax() == 1
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert data.argmax() == 3
        assert data.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_empty_array(self, method, data):
        """``argmin``/``argmax`` on an empty slice raise ``ValueError``."""
        # GH 24382
        err_msg = "attempt to get"
        with pytest.raises(ValueError, match=err_msg):
            getattr(data[:0], method)()

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_all_na(self, method, data, na_value):
        """``argmin``/``argmax`` on an all-NA array raise ``ValueError``."""
        # all missing with skipna=True is the same as empty
        err_msg = "attempt to get"
        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
        with pytest.raises(ValueError, match=err_msg):
            getattr(data_na, method)()

    @pytest.mark.parametrize(
        "op_name, skipna, expected",
        [
            ("idxmax", True, 0),
            ("idxmin", True, 2),
            ("argmax", True, 0),
            ("argmin", True, 2),
            ("idxmax", False, np.nan),
            ("idxmin", False, np.nan),
            ("argmax", False, -1),
            ("argmin", False, -1),
        ],
    )
    def test_argreduce_series(
        self, data_missing_for_sorting, op_name, skipna, expected
    ):
        """Series idx*/arg* reductions for both ``skipna`` settings."""
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        ser = pd.Series(data_missing_for_sorting)
        result = getattr(ser, op_name)(skipna=skipna)
        tm.assert_almost_equal(result, expected)

    def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
        """Array-level ``argmin``/``argmax`` reject ``skipna=False``."""
        # GH#38733
        data = data_missing_for_sorting

        # NOTE(review): match="" accepts any message; only the exception type
        # is effectively checked here.
        with pytest.raises(NotImplementedError, match=""):
            data.argmin(skipna=False)

        with pytest.raises(NotImplementedError, match=""):
            data.argmax(skipna=False)

    @pytest.mark.parametrize(
        "na_position, expected",
        [
            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
        ],
    )
    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
        """``nargsort`` places the missing value according to ``na_position``."""
        # GH 25439
        result = nargsort(data_missing_for_sorting, na_position=na_position)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """``Series.sort_values`` in both directions, with an optional key."""
        ser = pd.Series(data_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        expected = ser.iloc[[2, 0, 1]]
        if not ascending:
            # GH 35922. Expect stable sort
            if ser.nunique() == 2:
                expected = ser.iloc[[0, 1, 2]]
            else:
                expected = ser.iloc[[1, 0, 2]]

        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """The missing value sorts last in both directions."""
        ser = pd.Series(data_missing_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        if ascending:
            expected = ser.iloc[[2, 0, 1]]
        else:
            expected = ser.iloc[[0, 2, 1]]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """``DataFrame.sort_values`` on an integer and an extension column."""
        # NOTE(review): ``ascending`` is parametrized but never passed to
        # ``sort_values``, so both parametrizations run the same ascending sort.
        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
        result = df.sort_values(["A", "B"])
        expected = pd.DataFrame(
            {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
        )
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        """``unique`` collapses duplicates and returns the array type."""
        # Pass the dtype explicitly so the duplicated array cannot be
        # re-inferred to a different dtype than ``data``.
        duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))

        result = method(duplicated)

        assert len(result) == 1
        assert isinstance(result, type(data))
        assert result[0] == duplicated[0]

    @pytest.mark.parametrize("na_sentinel", [-1, -2])
    def test_factorize(self, data_for_grouping, na_sentinel):
        """``pd.factorize`` returns the expected codes and uniques."""
        codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        expected_codes = np.array(
            [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
        )
        expected_uniques = data_for_grouping.take([0, 4, 7])

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("na_sentinel", [-1, -2])
    def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
        """``pd.factorize`` and the array's own ``factorize`` agree."""
        codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)

        tm.assert_numpy_array_equal(codes_1, codes_2)
        self.assert_extension_array_equal(uniques_1, uniques_2)
        assert len(uniques_1) == len(pd.unique(uniques_1))
        assert uniques_1.dtype == data_for_grouping.dtype

    def test_factorize_empty(self, data):
        """Factorizing an empty slice yields empty codes and empty uniques."""
        codes, uniques = pd.factorize(data[:0])
        expected_codes = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    def test_fillna_copy_frame(self, data_missing):
        """``DataFrame.fillna`` returns new values, not the original object."""
        arr = data_missing.take([1, 1])
        df = pd.DataFrame({"A": arr})

        filled_val = df.iloc[0, 0]
        result = df.fillna(filled_val)

        assert df.A.values is not result.A.values

    def test_fillna_copy_series(self, data_missing):
        """``Series.fillna`` returns new values and leaves the input wrapped."""
        arr = data_missing.take([1, 1])
        ser = pd.Series(arr)

        filled_val = ser[0]
        result = ser.fillna(filled_val)

        assert ser._values is not result._values
        assert ser._values is arr

    def test_fillna_length_mismatch(self, data_missing):
        """Filling with an array of the wrong length raises ``ValueError``."""
        msg = "Length of 'value' does not match."
        with pytest.raises(ValueError, match=msg):
            data_missing.fillna(data_missing.take([1]))

    def test_combine_le(self, data_repeated):
        """``Series.combine`` with ``<=`` against a Series and a scalar."""
        # GH 20825
        # Test that combine works when doing a <= (le) comparison
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
        )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 <= x2)
        expected = pd.Series([a <= val for a in list(orig_data1)])
        self.assert_series_equal(result, expected)

    def test_combine_add(self, data_repeated):
        """``Series.combine`` with ``+`` against a Series and a scalar."""
        # GH 20825
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 + x2)
        # Overflow warnings are silenced while building the expected values.
        with np.errstate(over="ignore"):
            expected = pd.Series(
                orig_data1._from_sequence(
                    [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
                )
            )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 + x2)
        expected = pd.Series(
            orig_data1._from_sequence([a + val for a in list(orig_data1)])
        )
        self.assert_series_equal(result, expected)

    def test_combine_first(self, data):
        """``combine_first`` of overlapping slices rebuilds the first five."""
        # https://github.com/pandas-dev/pandas/issues/24147
        a = pd.Series(data[:3])
        b = pd.Series(data[2:5], index=[2, 3, 4])
        result = a.combine_first(b)
        expected = pd.Series(data[:5])
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("frame", [True, False])
    @pytest.mark.parametrize(
        "periods, indices",
        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
    )
    def test_container_shift(self, data, frame, periods, indices):
        """``shift`` on a Series/DataFrame matches ``take`` with fill."""
        # https://github.com/pandas-dev/pandas/issues/22386
        subset = data[:5]
        data = pd.Series(subset, name="A")
        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")

        if frame:
            result = data.to_frame(name="A").assign(B=1).shift(periods)
            expected = pd.concat(
                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
            )
            compare = self.assert_frame_equal
        else:
            result = data.shift(periods)
            compare = self.assert_series_equal

        compare(result, expected)

    def test_shift_0_periods(self, data):
        """``shift(0)`` must not share memory with the original array."""
        # GH#33856 shifting with periods=0 should return a copy, not same obj
        result = data.shift(0)
        assert data[0] != data[1]  # otherwise below is invalid
        data[0] = data[1]
        assert result[0] != result[1]  # i.e. not the same object/view

    @pytest.mark.parametrize("periods", [1, -2])
    def test_diff(self, data, periods):
        """``diff`` on Series/DataFrame equals ``op(data, data.shift(periods))``."""
        data = data[:5]
        # Boolean dtypes are differenced with xor, everything else with sub.
        if is_bool_dtype(data.dtype):
            op = operator.xor
        else:
            op = operator.sub
        try:
            # does this array implement ops?
            op(data, data)
        except Exception:
            pytest.skip(f"{type(data)} does not support diff")
        s = pd.Series(data)
        result = s.diff(periods)
        expected = pd.Series(op(data, data.shift(periods)))
        self.assert_series_equal(result, expected)

        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
        result = df.diff(periods)
        if periods == 1:
            b = [np.nan, 0, 0, 0, 0]
        else:
            b = [0, 0, 0, np.nan, np.nan]
        expected = pd.DataFrame({"A": expected, "B": b})
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "periods, indices",
        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
    )
    def test_shift_non_empty_array(self, data, periods, indices):
        """Array-level ``shift`` on two elements matches ``take`` with fill."""
        # https://github.com/pandas-dev/pandas/issues/23911
        subset = data[:2]
        result = subset.shift(periods)
        expected = subset.take(indices, allow_fill=True)
        self.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
    def test_shift_empty_array(self, data, periods):
        """Shifting an empty array yields an equal empty array."""
        # https://github.com/pandas-dev/pandas/issues/23911
        empty = data[:0]
        result = empty.shift(periods)
        expected = empty
        self.assert_extension_array_equal(result, expected)

    def test_shift_zero_copies(self, data):
        """``shift`` returns a new object for ``periods=0`` and for empty input."""
        result = data.shift(0)
        assert result is not data

        # Check identity against the empty slice that was actually shifted;
        # comparing with ``data`` would pass even if ``shift`` returned its
        # input unchanged.
        empty = data[:0]
        result = empty.shift(2)
        assert result is not empty

    def test_shift_fill_value(self, data):
        """``shift`` fills vacated positions with the given ``fill_value``."""
        arr = data[:4]
        fill_value = data[0]
        result = arr.shift(1, fill_value=fill_value)
        expected = data.take([0, 0, 1, 2])
        self.assert_extension_array_equal(result, expected)

        result = arr.shift(-2, fill_value=fill_value)
        expected = data.take([2, 3, 0, 0])
        self.assert_extension_array_equal(result, expected)

    def test_not_hashable(self, data):
        """Hashing the array raises ``TypeError``."""
        # We are in general mutable, so not hashable
        with pytest.raises(TypeError, match="unhashable type"):
            hash(data)

    def test_hash_pandas_object_works(self, data, as_frame):
        """``hash_pandas_object`` gives the same result on repeated calls."""
        # https://github.com/pandas-dev/pandas/issues/23066
        data = pd.Series(data)
        if as_frame:
            data = data.to_frame()
        a = pd.util.hash_pandas_object(data)
        b = pd.util.hash_pandas_object(data)
        self.assert_equal(a, b)

    def test_searchsorted(self, data_for_sorting, as_series):
        """``searchsorted`` for scalars, arrays, both sides and a sorter."""
        b, c, a = data_for_sorting
        arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]

        if as_series:
            arr = pd.Series(arr)
        assert arr.searchsorted(a) == 0
        assert arr.searchsorted(a, side="right") == 1

        assert arr.searchsorted(b) == 1
        assert arr.searchsorted(b, side="right") == 2

        assert arr.searchsorted(c) == 2
        assert arr.searchsorted(c, side="right") == 3

        result = arr.searchsorted(arr.take([0, 2]))
        expected = np.array([0, 2], dtype=np.intp)

        tm.assert_numpy_array_equal(result, expected)

        # sorter
        sorter = np.array([1, 2, 0])
        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

    def test_where_series(self, data, na_value, as_frame):
        """``where`` with a boolean mask, with and without an ``other``."""
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
        cond = np.array([True, True, False, False])

        if as_frame:
            ser = ser.to_frame(name="a")
            cond = cond.reshape(-1, 1)

        # Without ``other`` the masked-out positions become ``na_value``.
        result = ser.where(cond)
        expected = pd.Series(
            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
        )

        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

        # array other
        cond = np.array([True, False, True, True])
        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        if as_frame:
            other = pd.DataFrame({"a": other})
            cond = pd.DataFrame({"a": cond})
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
    def test_repeat(self, data, repeats, as_series, use_numpy):
        """``repeat`` (method or ``np.repeat``) for scalar and list repeats."""
        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
        if as_series:
            arr = pd.Series(arr)

        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)

        # Expand a scalar count to one count per element before zipping.
        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
        expected = type(data)._from_sequence(expected, dtype=data.dtype)
        if as_series:
            expected = pd.Series(expected, index=arr.index.repeat(repeats))

        self.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "repeats, kwargs, error, msg",
        [
            (2, {"axis": 1}, ValueError, "axis"),
            (-1, {}, ValueError, "negative"),
            ([1, 2], {}, ValueError, "shape"),
            (2, {"foo": "bar"}, TypeError, "'foo'"),
        ],
    )
    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
        """Invalid ``repeat`` arguments raise the parametrized error."""
        with pytest.raises(error, match=msg):
            if use_numpy:
                np.repeat(data, repeats, **kwargs)
            else:
                data.repeat(repeats, **kwargs)

    def test_delete(self, data):
        """``delete`` removes a single position or a list of positions."""
        result = data.delete(0)
        expected = data[1:]
        self.assert_extension_array_equal(result, expected)

        result = data.delete([1, 3])
        expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
        self.assert_extension_array_equal(result, expected)

    def test_insert(self, data):
        """``insert`` at the start (positive/negative loc) and in the middle."""
        # insert at the beginning
        result = data[1:].insert(0, data[0])
        self.assert_extension_array_equal(result, data)

        result = data[1:].insert(-len(data[1:]), data[0])
        self.assert_extension_array_equal(result, data)

        # insert at the middle
        result = data[:-1].insert(4, data[-1])

        # Build the take-indexer describing "last element moved to slot 4".
        taker = np.arange(len(data))
        taker[5:] = taker[4:-1]
        taker[4] = len(data) - 1
        expected = data.take(taker)
        self.assert_extension_array_equal(result, expected)

    def test_insert_invalid(self, data, invalid_scalar):
        """Inserting an invalid scalar raises ``TypeError`` or ``ValueError``."""
        item = invalid_scalar

        with pytest.raises((TypeError, ValueError)):
            data.insert(0, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(4, item)

        with pytest.raises((TypeError, ValueError)):
            data.insert(len(data) - 1, item)

    def test_insert_invalid_loc(self, data):
        """Out-of-bounds locations raise ``IndexError``; a float raises ``TypeError``."""
        ub = len(data)

        with pytest.raises(IndexError):
            data.insert(ub + 1, data[0])

        with pytest.raises(IndexError):
            data.insert(-ub - 1, data[0])

        with pytest.raises(TypeError):
            # we expect TypeError here instead of IndexError to match np.insert
            data.insert(1.5, data[0])

    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
    def test_equals(self, data, na_value, as_series, box):
        """``equals`` returns a real ``bool`` for equal and unequal inputs."""
        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)

        data = tm.box_expected(data, box, transpose=False)
        data2 = tm.box_expected(data2, box, transpose=False)
        data_na = tm.box_expected(data_na, box, transpose=False)

        # we are asserting with `is True/False` explicitly, to test that the
        # result is an actual Python bool, and not something "truthy"

        assert data.equals(data) is True
        assert data.equals(data.copy()) is True

        # unequal other data
        assert data.equals(data2) is False
        assert data.equals(data_na) is False

        # different length
        assert data[:2].equals(data[:3]) is False

        # empty are equal
        assert data[:0].equals(data[:0]) is True

        # other types
        assert data.equals(None) is False
        assert data[[0]].equals(data[0]) is False
|
||||
@@ -0,0 +1,160 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import is_sparse
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseMissingTests(BaseExtensionTests):
|
||||
def test_isna(self, data_missing):
|
||||
expected = np.array([True, False])
|
||||
|
||||
result = pd.isna(data_missing)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
result = pd.Series(data_missing).isna()
|
||||
expected = pd.Series(expected)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# GH 21189
|
||||
result = pd.Series(data_missing).drop([0, 1]).isna()
|
||||
expected = pd.Series([], dtype=bool)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("na_func", ["isna", "notna"])
|
||||
def test_isna_returns_copy(self, data_missing, na_func):
|
||||
result = pd.Series(data_missing)
|
||||
expected = result.copy()
|
||||
mask = getattr(result, na_func)()
|
||||
if is_sparse(mask):
|
||||
mask = np.array(mask)
|
||||
|
||||
mask[:] = True
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_dropna_array(self, data_missing):
|
||||
result = data_missing.dropna()
|
||||
expected = data_missing[[1]]
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_dropna_series(self, data_missing):
|
||||
ser = pd.Series(data_missing)
|
||||
result = ser.dropna()
|
||||
expected = ser.iloc[[1]]
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_dropna_frame(self, data_missing):
|
||||
df = pd.DataFrame({"A": data_missing})
|
||||
|
||||
# defaults
|
||||
result = df.dropna()
|
||||
expected = df.iloc[[1]]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
result = df.dropna(axis="columns")
|
||||
expected = pd.DataFrame(index=[0, 1])
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
# multiple
|
||||
df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
|
||||
result = df.dropna()
|
||||
expected = df.iloc[:0]
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_scalar(self, data_missing):
|
||||
valid = data_missing[1]
|
||||
result = data_missing.fillna(valid)
|
||||
expected = data_missing.fillna(valid)
|
||||
self.assert_extension_array_equal(result, expected)
|
||||
|
||||
def test_fillna_limit_pad(self, data_missing):
|
||||
arr = data_missing.take([1, 0, 0, 0, 1])
|
||||
result = pd.Series(arr).fillna(method="ffill", limit=2)
|
||||
expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_fillna_limit_backfill(self, data_missing):
|
||||
arr = data_missing.take([1, 0, 0, 0, 1])
|
||||
result = pd.Series(arr).fillna(method="backfill", limit=2)
|
||||
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_fillna_no_op_returns_copy(self, data):
|
||||
data = data[~data.isna()]
|
||||
|
||||
valid = data[0]
|
||||
result = data.fillna(valid)
|
||||
assert result is not data
|
||||
self.assert_extension_array_equal(result, data)
|
||||
|
||||
result = data.fillna(method="backfill")
|
||||
assert result is not data
|
||||
self.assert_extension_array_equal(result, data)
|
||||
|
||||
def test_fillna_series(self, data_missing):
|
||||
fill_value = data_missing[1]
|
||||
ser = pd.Series(data_missing)
|
||||
|
||||
result = ser.fillna(fill_value)
|
||||
expected = pd.Series(
|
||||
data_missing._from_sequence(
|
||||
[fill_value, fill_value], dtype=data_missing.dtype
|
||||
)
|
||||
)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# Fill with a series
|
||||
result = ser.fillna(expected)
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
# Fill with a series not affecting the missing values
|
||||
result = ser.fillna(ser)
|
||||
self.assert_series_equal(result, ser)
|
||||
|
||||
def test_fillna_series_method(self, data_missing, fillna_method):
|
||||
fill_value = data_missing[1]
|
||||
|
||||
if fillna_method == "ffill":
|
||||
data_missing = data_missing[::-1]
|
||||
|
||||
result = pd.Series(data_missing).fillna(method=fillna_method)
|
||||
expected = pd.Series(
|
||||
data_missing._from_sequence(
|
||||
[fill_value, fill_value], dtype=data_missing.dtype
|
||||
)
|
||||
)
|
||||
|
||||
self.assert_series_equal(result, expected)
|
||||
|
||||
def test_fillna_frame(self, data_missing):
|
||||
fill_value = data_missing[1]
|
||||
|
||||
result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
|
||||
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"A": data_missing._from_sequence(
|
||||
[fill_value, fill_value], dtype=data_missing.dtype
|
||||
),
|
||||
"B": [1, 2],
|
||||
}
|
||||
)
|
||||
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_fill_other(self, data):
|
||||
result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})
|
||||
|
||||
expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})
|
||||
|
||||
self.assert_frame_equal(result, expected)
|
||||
|
||||
def test_use_inf_as_na_no_effect(self, data_missing):
    """``isna`` gives the same mask with ``mode.use_inf_as_na`` switched on."""
    ser = pd.Series(data_missing)
    baseline = ser.isna()
    with pd.option_context("mode.use_inf_as_na", True):
        with_option = ser.isna()
    self.assert_series_equal(with_option, baseline)
|
||||
@@ -0,0 +1,217 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core import ops
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseOpsUtil(BaseExtensionTests):
    """Shared helpers for the operator test classes.

    Offers operator lookup by name plus checks that either compare an
    operator's result with a point-wise ``combine`` or assert that the
    operator raises.
    """

    def get_op_from_name(self, op_name: str):
        """Return whatever ``tm.get_op_from_name`` maps ``op_name`` to."""
        return tm.get_op_from_name(op_name)

    def check_opname(self, ser: pd.Series, op_name: str, other, exc=Exception):
        """Resolve ``op_name`` to an operator and delegate to ``_check_op``."""
        self._check_op(ser, self.get_op_from_name(op_name), other, op_name, exc)

    def _combine(self, obj, other, op):
        """Build the expected result by combining ``obj`` with ``other`` via ``op``.

        A single-column DataFrame is combined through its only column and
        turned back into a frame; any other DataFrame raises
        ``NotImplementedError``.
        """
        if not isinstance(obj, pd.DataFrame):
            return obj.combine(other, op)
        if len(obj.columns) != 1:
            raise NotImplementedError
        return obj.iloc[:, 0].combine(other, op).to_frame()

    def _check_op(
        self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError
    ):
        """Check ``op(ser, other)``.

        With ``exc=None`` the result must have the type of ``ser`` and equal
        the ``_combine`` result; otherwise the call must raise ``exc``.
        """
        if exc is not None:
            with pytest.raises(exc):
                op(ser, other)
            return

        result = op(ser, other)
        expected = self._combine(ser, other, op)
        assert isinstance(result, type(ser))
        self.assert_equal(result, expected)

    def _check_divmod_op(self, ser: pd.Series, op, other, exc=Exception):
        """Check a divmod-style ``op`` against separate ``//`` and ``%`` results."""
        # divmod has multiple return values, so check separately
        if exc is not None:
            with pytest.raises(exc):
                divmod(ser, other)
            return

        result_div, result_mod = op(ser, other)
        if op is divmod:
            expected_div, expected_mod = ser // other, ser % other
        else:
            # any other op is treated as the reflected form: operands swapped
            expected_div, expected_mod = other // ser, other % ser
        self.assert_series_equal(result_div, expected_div)
        self.assert_series_equal(result_mod, expected_mod)
|
||||
|
||||
|
||||
class BaseArithmeticOpsTests(BaseOpsUtil):
    """
    Arithmetic operator tests for Series and DataFrame.

    Each ``*_exc`` class variable names the exception the matching group of
    tests expects; all of them default to ``TypeError``. A subclass that
    supports a kind of operation overrides the corresponding variable:

    * series_scalar_exc
    * frame_scalar_exc
    * series_array_exc
    * divmod_exc
    """

    series_scalar_exc: type[TypeError] | None = TypeError
    frame_scalar_exc: type[TypeError] | None = TypeError
    series_array_exc: type[TypeError] | None = TypeError
    divmod_exc: type[TypeError] | None = TypeError

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Series op scalar, with the scalar taken from the Series itself."""
        ser = pd.Series(data)
        self.check_opname(
            ser, all_arithmetic_operators, ser.iloc[0], exc=self.series_scalar_exc
        )

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """Single-column DataFrame op scalar."""
        frame = pd.DataFrame({"A": data})
        self.check_opname(
            frame, all_arithmetic_operators, data[0], exc=self.frame_scalar_exc
        )

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Series op another Series made of the first element repeated."""
        ser = pd.Series(data)
        repeated = pd.Series([ser.iloc[0]] * len(ser))
        self.check_opname(
            ser, all_arithmetic_operators, repeated, exc=self.series_array_exc
        )

    def test_divmod(self, data):
        """divmod and reflected divmod between a Series and the scalar 1."""
        ser = pd.Series(data)
        self._check_divmod_op(ser, divmod, 1, exc=self.divmod_exc)
        self._check_divmod_op(1, ops.rdivmod, ser, exc=self.divmod_exc)

    def test_divmod_series_array(self, data, data_for_twos):
        """divmod between a Series and array-like operands."""
        ser = pd.Series(data)
        self._check_divmod_op(ser, divmod, data)

        # reflected op, first with the raw array and then with it boxed
        self._check_divmod_op(data_for_twos, ops.rdivmod, ser)
        self._check_divmod_op(pd.Series(data_for_twos), ops.rdivmod, ser)

    def test_add_series_with_extension_array(self, data):
        """``Series + array`` equals a Series wrapping ``array + array``."""
        total = pd.Series(data) + data
        self.assert_series_equal(total, pd.Series(data + data))

    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
    def test_direct_arith_with_ndframe_returns_not_implemented(
        self, request, data, box
    ):
        # EAs should return NotImplemented for ops with Series/DataFrame
        # Pandas takes care of unboxing the series and calling the EA's op.
        boxed = pd.Series(data)
        if box is pd.DataFrame:
            boxed = boxed.to_frame()

        if not hasattr(data, "__add__"):
            # mark as expected failure; the call below is still attempted
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"{type(data).__name__} does not implement add"
                )
            )

        assert data.__add__(boxed) is NotImplemented
|
||||
|
||||
|
||||
class BaseComparisonOpsTests(BaseOpsUtil):
    """Comparison operator tests for Series and DataFrame."""

    def _compare_other(self, ser: pd.Series, data, op, other):
        """Check ``op(ser, other)`` against the point-wise ``ser.combine``.

        ``eq``/``ne`` must always succeed. For any other operator the test
        only requires agreement: either both succeed with equal results or
        ``combine`` raises the same exception type as the operator did.
        """
        if op.__name__ in ["eq", "ne"]:
            # comparison should match point-wise comparisons
            self.assert_series_equal(op(ser, other), ser.combine(other, op))
            return

        failure = None
        try:
            result = op(ser, other)
        except Exception as err:
            failure = err

        if failure is not None:
            with pytest.raises(type(failure)):
                ser.combine(other, op)
            return

        # Didn't error, then should match pointwise behavior
        self.assert_series_equal(result, ser.combine(other, op))

    def test_compare_scalar(self, data, comparison_op):
        """Compare a Series against the scalar 0."""
        self._compare_other(pd.Series(data), data, comparison_op, 0)

    def test_compare_array(self, data, comparison_op):
        """Compare a Series against a Series of its first element repeated."""
        ser = pd.Series(data)
        repeated = pd.Series([data[0]] * len(data))
        self._compare_other(ser, data, comparison_op, repeated)

    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
    def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
        # EAs should return NotImplemented for ops with Series/DataFrame
        # Pandas takes care of unboxing the series and calling the EA's op.
        boxed = pd.Series(data)
        if box is pd.DataFrame:
            boxed = boxed.to_frame()

        if not hasattr(data, "__eq__"):
            raise pytest.skip(f"{type(data).__name__} does not implement __eq__")
        assert data.__eq__(boxed) is NotImplemented

        if not hasattr(data, "__ne__"):
            raise pytest.skip(f"{type(data).__name__} does not implement __ne__")
        assert data.__ne__(boxed) is NotImplemented
|
||||
|
||||
|
||||
class BaseUnaryOpsTests(BaseOpsUtil):
    """Tests for unary operators and their NumPy ufunc counterparts."""

    def test_invert(self, data):
        """``~Series`` equals a Series wrapping ``~data``; the name is kept."""
        inverted = ~pd.Series(data, name="name")
        self.assert_series_equal(inverted, pd.Series(~data, name="name"))

    @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
    def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
        # the dunder __pos__ works if and only if np.positive works,
        # same for __neg__/np.negative and __abs__/np.abs
        dunder_for = {
            np.positive: "__pos__",
            np.negative: "__neg__",
            np.abs: "__abs__",
        }
        attr = dunder_for[ufunc]

        try:
            via_dunder = getattr(data, attr)()
        except Exception as err:
            # if __pos__ raised, then so should the ufunc
            with pytest.raises((type(err), TypeError)):
                ufunc(data)
        else:
            via_ufunc = ufunc(data)
            self.assert_extension_array_equal(via_dunder, via_ufunc)
|
||||
@@ -0,0 +1,42 @@
|
||||
import io
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BasePrintingTests(BaseExtensionTests):
    """Checks on how an extension array is rendered when printed."""

    @pytest.mark.parametrize("size", ["big", "small"])
    def test_array_repr(self, data, size):
        """The array repr names the class, the length and the dtype.

        "small" uses the first five elements; "big" uses five concatenated
        copies of ``data`` and must also contain an ellipsis.
        """
        if size == "small":
            arr = data[:5]
        else:
            arr = type(data)._concat_same_type([data] * 5)

        text = repr(arr)
        assert type(arr).__name__ in text
        assert f"Length: {len(arr)}" in text
        assert str(arr.dtype) in text
        if size == "big":
            assert "..." in text

    def test_array_repr_unicode(self, data):
        """``str(data)`` returns a ``str``."""
        assert isinstance(str(data), str)

    def test_series_repr(self, data):
        """The dtype name shows up in the Series repr."""
        assert data.dtype.name in repr(pd.Series(data))

    def test_dataframe_repr(self, data):
        """Building the DataFrame repr does not raise."""
        repr(pd.DataFrame({"A": data}))

    def test_dtype_name_in_info(self, data):
        """The dtype name shows up in the ``DataFrame.info`` output."""
        buffer = io.StringIO()
        pd.DataFrame({"A": data}).info(buf=buffer)
        assert data.dtype.name in buffer.getvalue()
|
||||
@@ -0,0 +1,69 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseReduceTests(BaseExtensionTests):
    """
    Tests specific to reductions. These generally only
    make sense for numeric/boolean operations.
    """

    def check_reduce(self, s, op_name, skipna):
        """Compare reduction ``op_name`` on ``s`` with the same reduction on
        ``s`` cast to ``float64``."""
        actual = getattr(s, op_name)(skipna=skipna)
        reference = getattr(s.astype("float64"), op_name)(skipna=skipna)
        tm.assert_almost_equal(actual, reference)
|
||||
|
||||
|
||||
class BaseNoReduceTests(BaseReduceTests):
    """Tests for arrays that define no reductions: each one must raise TypeError."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """Every numeric reduction on the Series raises ``TypeError``."""
        ser = pd.Series(data)
        reduction = getattr(ser, all_numeric_reductions)

        # NOTE(review): the pattern ends with "|", i.e. an empty alternative,
        # so as written it matches any message -- confirm this is intended.
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            reduction(skipna=skipna)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        """Every boolean reduction on the Series raises ``TypeError``."""
        ser = pd.Series(data)
        reduction = getattr(ser, all_boolean_reductions)

        # NOTE(review): the pattern ends with "|", i.e. an empty alternative,
        # so as written it matches any message -- confirm this is intended.
        msg = (
            "[Cc]annot perform|Categorical is not ordered for operation|"
            "does not support reduction|"
        )

        with pytest.raises(TypeError, match=msg):
            reduction(skipna=skipna)
|
||||
|
||||
|
||||
class BaseNumericReduceTests(BaseReduceTests):
    """Runs ``check_reduce`` for every numeric reduction."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna):
        """Check one numeric reduction on a Series built from ``data``."""
        ser = pd.Series(data)

        # min/max with empty produce numpy warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            self.check_reduce(ser, all_numeric_reductions, skipna)
|
||||
|
||||
|
||||
class BaseBooleanReduceTests(BaseReduceTests):
    """Runs ``check_reduce`` for every boolean reduction."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        """Check one boolean reduction on a Series built from ``data``."""
        self.check_reduce(pd.Series(data), all_boolean_reductions, skipna)
|
||||
@@ -0,0 +1,370 @@
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import ExtensionArray
|
||||
from pandas.core.internals.blocks import EABackedBlock
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseReshapingTests(BaseExtensionTests):
    """Reshaping and concatenation tests for extension arrays."""

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat(self, data, in_frame):
        """Concatenating an object with itself doubles the length and keeps
        the extension dtype and extension-backed storage."""
        obj = pd.Series(data)
        if in_frame:
            obj = pd.DataFrame(obj)
        combined = pd.concat([obj, obj], ignore_index=True)

        assert len(combined) == len(data) * 2

        dtype = combined.dtypes[0] if in_frame else combined.dtype
        assert dtype == data.dtype

        # only managers that expose ``blocks`` get the block-type check
        if hasattr(combined._mgr, "blocks"):
            assert isinstance(combined._mgr.blocks[0], EABackedBlock)
        assert isinstance(combined._mgr.arrays[0], ExtensionArray)

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat_all_na_block(self, data_missing, in_frame):
        """Concatenate a block taken from position 1 with one taken from position 0."""
        present = pd.Series(data_missing.take([1, 1]), index=[0, 1])
        absent = pd.Series(data_missing.take([0, 0]), index=[2, 3])
        if in_frame:
            present = pd.DataFrame({"a": present})
            absent = pd.DataFrame({"a": absent})

        result = pd.concat([present, absent])

        if not in_frame:
            expected = pd.Series(data_missing.take([1, 1, 0, 0]))
            self.assert_series_equal(result, expected)
            return
        expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
        self.assert_frame_equal(result, expected)

    def test_concat_mixed_dtypes(self, data):
        """Concatenating with other dtypes matches concatenating object casts."""
        # https://github.com/pandas-dev/pandas/issues/20762
        ext_df = pd.DataFrame({"A": data[:3]})
        int_df = pd.DataFrame({"A": [1, 2, 3]})
        cat_df = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
        frames = [ext_df, int_df, cat_df]

        # dataframes
        result = pd.concat(frames)
        expected = pd.concat([frame.astype(object) for frame in frames])
        self.assert_frame_equal(result, expected)

        # series
        result = pd.concat([frame["A"] for frame in frames])
        expected = pd.concat([frame["A"].astype(object) for frame in frames])
        self.assert_series_equal(result, expected)

        # simple test for just EA and one other
        result = pd.concat([ext_df, int_df.astype(object)])
        expected = pd.concat([ext_df.astype("object"), int_df.astype("object")])
        self.assert_frame_equal(result, expected)

        result = pd.concat([ext_df["A"], int_df["A"].astype(object)])
        expected = pd.concat(
            [ext_df["A"].astype("object"), int_df["A"].astype("object")]
        )
        self.assert_series_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Column-wise concat, with aligned and then non-aligned indexes."""
        left = pd.DataFrame({"A": data[:3]})
        right = pd.DataFrame({"B": [1, 2, 3]})

        expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
        result = pd.concat([left, right], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([left["A"], right["B"]], axis=1)
        self.assert_frame_equal(result, expected)

        # non-aligned
        right = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
        padded = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded, "B": [np.nan, 1, 2, 3]})

        result = pd.concat([left, right], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([left["A"], right["B"]], axis=1)
        self.assert_frame_equal(result, expected)

    def test_concat_extension_arrays_copy_false(self, data, na_value):
        """Column-wise concat of two extension frames with ``copy=False``."""
        # GH 20756
        left = pd.DataFrame({"A": data[:3]})
        right = pd.DataFrame({"B": data[3:7]})
        padded = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded, "B": data[3:7]})

        result = pd.concat([left, right], axis=1, copy=False)
        self.assert_frame_equal(result, expected)

    def test_concat_with_reindex(self, data):
        """Row-wise concat of frames with different columns fills the gaps."""
        # GH-33027
        a = pd.DataFrame({"a": data[:5]})
        b = pd.DataFrame({"b": data[:5]})
        result = pd.concat([a, b], ignore_index=True)

        first_five = list(range(5))
        fill = [-1] * 5  # -1 together with allow_fill=True yields missing values
        expected = pd.DataFrame(
            {
                "a": data.take(first_five + fill, allow_fill=True),
                "b": data.take(fill + first_five, allow_fill=True),
            }
        )
        self.assert_frame_equal(result, expected)

    def test_align(self, data, na_value):
        """Aligning two offset Series pads each side with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        left, right = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))

        # Assumes that the ctor can take a list of scalars of the type
        expected_left = pd.Series(
            data._from_sequence(list(a) + [na_value], dtype=data.dtype)
        )
        expected_right = pd.Series(
            data._from_sequence([na_value] + list(b), dtype=data.dtype)
        )
        self.assert_series_equal(left, expected_left)
        self.assert_series_equal(right, expected_right)

    def test_align_frame(self, data, na_value):
        """Aligning two offset single-column frames pads each with ``na_value``."""
        a = data[:3]
        b = data[2:5]
        left, right = pd.DataFrame({"A": a}).align(
            pd.DataFrame({"A": b}, index=[1, 2, 3])
        )

        # Assumes that the ctor can take a list of scalars of the type
        expected_left = pd.DataFrame(
            {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
        )
        expected_right = pd.DataFrame(
            {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
        )
        self.assert_frame_equal(left, expected_left)
        self.assert_frame_equal(right, expected_right)

    def test_align_series_frame(self, data, na_value):
        """Aligning a Series with a frame one row longer pads the Series."""
        # https://github.com/pandas-dev/pandas/issues/20576
        ser = pd.Series(data, name="a")
        frame = pd.DataFrame({"col": np.arange(len(ser) + 1)})
        aligned_ser, aligned_frame = ser.align(frame)

        expected_ser = pd.Series(
            data._from_sequence(list(data) + [na_value], dtype=data.dtype),
            name=ser.name,
        )

        self.assert_series_equal(aligned_ser, expected_ser)
        self.assert_frame_equal(aligned_frame, frame)

    def test_set_frame_expand_regular_with_extension(self, data):
        """Add an extension column to a frame holding a regular column."""
        frame = pd.DataFrame({"A": [1] * len(data)})
        frame["B"] = data
        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
        self.assert_frame_equal(frame, expected)

    def test_set_frame_expand_extension_with_regular(self, data):
        """Add a regular column to a frame holding an extension column."""
        frame = pd.DataFrame({"A": data})
        frame["B"] = [1] * len(data)
        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
        self.assert_frame_equal(frame, expected)

    def test_set_frame_overwrite_object(self, data):
        """Overwriting an object column with ``data`` adopts the extension dtype."""
        # https://github.com/pandas-dev/pandas/issues/20555
        frame = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
        frame["A"] = data
        assert frame.dtypes["A"] == data.dtype

    def test_merge(self, data, na_value):
        """Default and outer merges carry the extension column through."""
        # GH-20743
        left = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
        right = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})

        merged = pd.merge(left, right)
        expected = pd.DataFrame(
            {
                "int1": [1, 1, 2],
                "int2": [1, 2, 3],
                "key": [0, 0, 1],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1]], dtype=data.dtype
                ),
            }
        )
        self.assert_frame_equal(merged, expected[["ext", "int1", "key", "int2"]])

        merged = pd.merge(left, right, how="outer")
        expected = pd.DataFrame(
            {
                "int1": [1, 1, 2, 3, np.nan],
                "int2": [1, 2, 3, np.nan, 4],
                "key": [0, 0, 1, 2, 3],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
                ),
            }
        )
        self.assert_frame_equal(merged, expected[["ext", "int1", "key", "int2"]])

    def test_merge_on_extension_array(self, data):
        """Merge a frame with itself on an extension-typed key column."""
        # GH 23020
        a, b = data[:2]
        key = type(data)._from_sequence([a, b], dtype=data.dtype)

        frame = pd.DataFrame({"key": key, "val": [1, 2]})
        result = pd.merge(frame, frame, on="key")
        expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
        self.assert_frame_equal(result, expected)

        # order
        result = pd.merge(frame.iloc[[1, 0]], frame, on="key")
        expected = expected.iloc[[1, 0]].reset_index(drop=True)
        self.assert_frame_equal(result, expected)

    def test_merge_on_extension_array_duplicates(self, data):
        """Merge on an extension-typed key column that has a repeated key."""
        # GH 23020
        a, b = data[:2]
        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
        left = pd.DataFrame({"key": key, "val": [1, 2, 3]})
        right = pd.DataFrame({"key": key, "val": [1, 2, 3]})

        result = pd.merge(left, right, on="key")
        expected = pd.DataFrame(
            {
                "key": key.take([0, 0, 0, 0, 1]),
                "val_x": [1, 1, 3, 3, 2],
                "val_y": [1, 3, 1, 3, 2],
            }
        )
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns",
        [
            ["A", "B"],
            pd.MultiIndex.from_tuples(
                [("A", "a"), ("A", "b")], names=["outer", "inner"]
            ),
        ],
    )
    def test_stack(self, data, columns):
        """``stack`` keeps the extension dtype and matches the object-dtype stack."""
        frame = pd.DataFrame({"A": data[:5], "B": data[:5]})
        frame.columns = columns
        result = frame.stack()
        expected = frame.astype(object).stack()
        # we need a second astype(object), in case the constructor inferred
        # object -> specialized, as is done for period.
        expected = expected.astype(object)

        column_dtype = frame.iloc[:, 0].dtype
        if isinstance(expected, pd.Series):
            assert result.dtype == column_dtype
        else:
            assert all(result.dtypes == column_dtype)

        result = result.astype(object)
        self.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            # Two levels, uniform.
            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
            # non-uniform
            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
            # three levels, non-uniform
            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
            pd.MultiIndex.from_tuples(
                [
                    ("A", "a", 1),
                    ("A", "b", 0),
                    ("A", "a", 0),
                    ("B", "a", 0),
                    ("B", "c", 1),
                ]
            ),
        ],
    )
    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, data, index, obj):
        """Unstack over every ordered selection of levels and compare with
        the object-dtype result."""
        data = data[: len(index)]
        is_series = obj == "series"
        if is_series:
            ser = pd.Series(data, index=index)
        else:
            ser = pd.DataFrame({"A": data, "B": data}, index=index)

        n = index.nlevels
        levels = list(range(n))
        # [0, 1, 2]
        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        combinations = itertools.chain.from_iterable(
            itertools.permutations(levels, i) for i in range(1, n)
        )

        for level in combinations:
            result = ser.unstack(level=level)
            assert all(
                isinstance(result[col].array, type(data)) for col in result.columns
            )

            if is_series:
                # We should get the same result with to_frame+unstack+droplevel
                alt = ser.to_frame().unstack(level=level).droplevel(0, axis=1)
                self.assert_frame_equal(result, alt)

            expected = ser.astype(object).unstack(
                level=level, fill_value=data.dtype.na_value
            )
            if is_series and not isinstance(ser.dtype, pd.SparseDtype):
                # GH#34457 SparseArray.astype(object) gives Sparse[object]
                # instead of np.dtype(object)
                assert (expected.dtypes == object).all()

            result = result.astype(object)

            self.assert_frame_equal(result, expected)

    def test_ravel(self, data):
        """``ravel`` keeps the type and returns a view of ``data``."""
        # as long as EA is 1D-only, ravel is a no-op
        flat = data.ravel()
        assert type(flat) == type(data)

        # Check that we have a view, not a copy
        flat[0] = flat[1]
        assert data[0] == data[1]

    def test_transpose(self, data):
        """``transpose`` returns a new object of the same type viewing ``data``."""
        flipped = data.transpose()
        assert type(flipped) == type(data)

        # check we get a new object
        assert flipped is not data

        # If we ever _did_ support 2D, shape should be reversed
        assert flipped.shape == data.shape[::-1]

        # Check that we have a view, not a copy
        flipped[0] = flipped[1]
        assert data[0] == data[1]

    def test_transpose_frame(self, data):
        """Transposing a two-column extension frame keeps the dtype, and a
        double transpose round-trips."""
        frame = pd.DataFrame(
            {"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"]
        )
        result = frame.T
        expected = pd.DataFrame(
            {
                "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
                "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
                "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
                "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
            },
            index=["A", "B"],
        )
        self.assert_frame_equal(result, expected)
        self.assert_frame_equal(np.transpose(np.transpose(frame)), frame)
        self.assert_frame_equal(
            np.transpose(np.transpose(frame[["A"]])), frame[["A"]]
        )
|
||||
@@ -0,0 +1,428 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
DatetimeTZDtype,
|
||||
IntervalDtype,
|
||||
PandasDtype,
|
||||
PeriodDtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension.base.base import BaseExtensionTests
|
||||
|
||||
|
||||
class BaseSetitemTests(BaseExtensionTests):
|
||||
@pytest.fixture(
    params=[
        lambda obj: obj.index,
        lambda obj: list(obj.index),
        lambda obj: slice(None),
        lambda obj: slice(0, len(obj)),
        lambda obj: range(len(obj)),
        lambda obj: list(range(len(obj))),
        lambda obj: np.ones(len(obj), dtype=bool),
    ],
    ids=[
        "index",
        "list[index]",
        "null_slice",
        "full_slice",
        "range",
        "list(range)",
        "mask",
    ],
)
def full_indexer(self, request):
    """
    Fixture yielding a callable that, given an object, builds an indexer
    covering its full length, for use with ``obj.loc`` when getting/setting.

    Some of the variants assume that ``obj.index`` is the default RangeIndex.
    """
    return request.param
|
||||
|
||||
def test_setitem_scalar_series(self, data, box_in_series):
    """After copying position 1 into position 0 the two compare equal."""
    target = pd.Series(data) if box_in_series else data
    target[0] = target[1]
    assert target[0] == target[1]
|
||||
|
||||
def test_setitem_sequence(self, data, box_in_series):
    """Setting positions ``[0, 1]`` from a list swaps the first two values."""
    target = pd.Series(data) if box_in_series else data
    snapshot = target.copy()

    target[[0, 1]] = [target[1], target[0]]

    assert target[0] == snapshot[1]
    assert target[1] == snapshot[0]
|
||||
|
||||
def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
    """Setting with a value of the wrong length raises and changes nothing.

    A one-element value (a list, or an extension array when ``as_array`` is
    true) is assigned through a two-element list indexer and through a slice.
    Both must raise ``ValueError`` and leave the Series unmodified.
    """
    ser = pd.Series(data)
    original = ser.copy()
    value = [data[0]]
    if as_array:
        # Pass the dtype explicitly, as the other ``_from_sequence`` calls in
        # these tests do, so the value is built with the dtype under test.
        value = data._from_sequence(value, dtype=data.dtype)

    xpr = "cannot set using a {} indexer with a different length"
    with pytest.raises(ValueError, match=xpr.format("list-like")):
        ser[[0, 1]] = value
    # Ensure no modifications made before the exception
    self.assert_series_equal(ser, original)

    with pytest.raises(ValueError, match=xpr.format("slice")):
        ser[slice(3)] = value
    self.assert_series_equal(ser, original)
|
||||
|
||||
def test_setitem_empty_indexer(self, data, box_in_series):
    """Setting through an empty integer indexer leaves the object unchanged."""
    target = pd.Series(data) if box_in_series else data
    snapshot = target.copy()
    target[np.array([], dtype=int)] = []
    self.assert_equal(target, snapshot)
|
||||
|
||||
def test_setitem_sequence_broadcasts(self, data, box_in_series):
    """A scalar assigned through a list indexer lands in every listed position."""
    target = pd.Series(data) if box_in_series else data
    target[[0, 1]] = target[2]
    assert target[0] == target[2]
    assert target[1] == target[2]
|
||||
|
||||
@pytest.mark.parametrize("setter", ["loc", "iloc"])
def test_setitem_scalar(self, data, setter):
    """Set position 0 of a Series through ``.loc`` or ``.iloc``."""
    ser = pd.Series(data)
    indexer = getattr(ser, setter)
    indexer[0] = data[1]
    assert ser[0] == data[1]
|
||||
|
||||
def test_setitem_loc_scalar_mixed(self, data):
    """``.loc`` scalar set on the extension column of a mixed-dtype frame."""
    frame = pd.DataFrame({"A": np.arange(len(data)), "B": data})
    frame.loc[0, "B"] = data[1]
    assert frame.loc[0, "B"] == data[1]
|
||||
|
||||
def test_setitem_loc_scalar_single(self, data):
    """``.loc`` scalar set at label 10 of a single-column extension frame."""
    frame = pd.DataFrame({"B": data})
    frame.loc[10, "B"] = data[1]
    assert frame.loc[10, "B"] == data[1]
|
||||
|
||||
def test_setitem_loc_scalar_multiple_homogoneous(self, data):
    """``.loc`` scalar set at label 10 when both columns hold ``data``."""
    frame = pd.DataFrame({"A": data, "B": data})
    frame.loc[10, "B"] = data[1]
    assert frame.loc[10, "B"] == data[1]
|
||||
|
||||
def test_setitem_iloc_scalar_mixed(self, data):
    """``.iloc`` scalar set on the extension column of a mixed-dtype frame."""
    frame = pd.DataFrame({"A": np.arange(len(data)), "B": data})
    frame.iloc[0, 1] = data[1]
    assert frame.loc[0, "B"] == data[1]
|
||||
|
||||
def test_setitem_iloc_scalar_single(self, data):
    """``.iloc`` scalar set at row 10 of a single-column extension frame."""
    frame = pd.DataFrame({"B": data})
    frame.iloc[10, 0] = data[1]
    assert frame.loc[10, "B"] == data[1]
|
||||
|
||||
def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
    """``.iloc`` scalar set at row 10 when both columns hold ``data``."""
    frame = pd.DataFrame({"A": data, "B": data})
    frame.iloc[10, 1] = data[1]
    assert frame.loc[10, "B"] == data[1]
|
||||
|
||||
@pytest.mark.parametrize(
    "mask",
    [
        np.array([True, True, True, False, False]),
        pd.array([True, True, True, False, False], dtype="boolean"),
        pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
    ],
    ids=["numpy-array", "boolean-array", "boolean-array-na"],
)
def test_setitem_mask(self, data, mask, box_in_series):
    """Assigning ``data[0]`` through a boolean mask overwrites only the first
    three of five positions."""
    target = data[:5].copy()
    expected = target.take([0, 0, 0, 3, 4])
    if box_in_series:
        target = pd.Series(target)
        expected = pd.Series(expected)

    target[mask] = data[0]

    self.assert_equal(expected, target)
|
||||
|
||||
def test_setitem_mask_raises(self, data, box_in_series):
    """A two-element boolean mask, as a NumPy array or as a "boolean" array,
    raises ``IndexError``."""
    target = pd.Series(data) if box_in_series else data

    # wrong length
    numpy_mask = np.array([True, False])
    with pytest.raises(IndexError, match="wrong length"):
        target[numpy_mask] = target[0]

    nullable_mask = pd.array(numpy_mask, dtype="boolean")
    with pytest.raises(IndexError, match="wrong length"):
        target[nullable_mask] = target[0]
|
||||
|
||||
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
    """Set through a "boolean" mask that is True for the first three
    positions, NA for the next two and False elsewhere."""
    mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
    mask[:3] = True
    mask[3:5] = pd.NA

    target = pd.Series(data) if box_in_series else data
    target[mask] = target[0]

    assert (target[:3] == target[0]).all()
|
||||
|
||||
@pytest.mark.parametrize(
    "idx",
    [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
    ids=["list", "integer-array", "numpy-array"],
)
def test_setitem_integer_array(self, data, idx, box_in_series):
    """Assigning the first element through an integer indexer overwrites
    positions 0-2 of a five-element copy."""
    target = data[:5].copy()
    expected = data.take([0, 0, 0, 3, 4])
    if box_in_series:
        target = pd.Series(target)
        expected = pd.Series(expected)

    target[idx] = target[0]

    self.assert_equal(target, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "idx, box_in_series",
    [
        ([0, 1, 2, pd.NA], False),
        pytest.param(
            [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
        ),
        (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
        # Previously a repeat of the ``False`` case above, so the boxed Int64
        # indexer announced by the "integer-array-True" id was never run.
        (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True),
    ],
    ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
)
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
    """Setting through an integer indexer that contains NA raises ``ValueError``.

    Runs against the bare array and against a Series with a random string
    index; the boxed list case is marked xfail (GH-31948).
    """
    arr = data.copy()

    # TODO(xfail) this raises KeyError about labels not found (it tries label-based)
    # for list of labels with Series
    if box_in_series:
        arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])

    msg = "Cannot index with an integer indexer containing NA values"
    with pytest.raises(ValueError, match=msg):
        arr[idx] = arr[0]
|
||||
|
||||
@pytest.mark.parametrize("as_callable", [True, False])
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_aligned(self, data, as_callable, setter):
    """
    Assign a two-element slice through a boolean mask (or a callable
    returning that mask), either via ``Series.loc`` or ``Series.__setitem__``.

    The assignment is performed only once, through the parametrized
    ``target``. A second unconditional ``ser[mask2] = ...`` used to follow
    it, which meant the ``loc`` path could never make this test fail.
    """
    ser = pd.Series(data)
    mask = np.zeros(len(data), dtype=bool)
    mask[:2] = True

    if as_callable:
        mask2 = lambda x: mask
    else:
        mask2 = mask

    if setter:
        # loc
        target = getattr(ser, setter)
    else:
        # Series.__setitem__
        target = ser

    target[mask2] = data[5:7]

    assert ser[0] == data[5]
    assert ser[1] == data[6]
|
||||
|
||||
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_broadcast(self, data, setter):
    """A scalar assigned through a boolean mask is broadcast to every hit."""
    ser = pd.Series(data)
    mask = np.zeros(len(data), dtype=bool)
    mask[:2] = True

    # Truthy setter -> use that indexer (loc); otherwise Series.__setitem__.
    target = getattr(ser, setter) if setter else ser

    target[mask] = data[10]
    for position in (0, 1):
        assert ser[position] == data[10]
|
||||
|
||||
def test_setitem_expand_columns(self, data):
    """Adding a new column, then overwriting it with extension data."""
    base_frame = pd.DataFrame({"A": data})
    with_ints = pd.DataFrame({"A": data, "B": [1] * len(data)})

    # New column via __setitem__.
    result = base_frame.copy()
    result["B"] = 1
    self.assert_frame_equal(result, with_ints)

    # New column via .loc.
    result = base_frame.copy()
    result.loc[:, "B"] = 1
    self.assert_frame_equal(result, with_ints)

    # overwrite with new type
    result["B"] = data
    with_ext = pd.DataFrame({"A": data, "B": data})
    self.assert_frame_equal(result, with_ext)
|
||||
|
||||
def test_setitem_expand_with_extension(self, data):
    """Adding an extension-typed column to a frame of plain integers."""
    base_frame = pd.DataFrame({"A": [1] * len(data)})
    want = pd.DataFrame({"A": [1] * len(data), "B": data})

    # Via __setitem__.
    result = base_frame.copy()
    result["B"] = data
    self.assert_frame_equal(result, want)

    # Via .loc.
    result = base_frame.copy()
    result.loc[:, "B"] = data
    self.assert_frame_equal(result, want)
|
||||
|
||||
def test_setitem_frame_invalid_length(self, data):
    """Assigning a too-short column must raise a length-mismatch ValueError."""
    df = pd.DataFrame({"A": [1] * len(data)})
    # Regex for the expected message; the parentheses are escaped literals.
    xpr = (
        rf"Length of values \({len(data[:5])}\) "
        rf"does not match length of index \({len(df)}\)"
    )
    with pytest.raises(ValueError, match=xpr):
        df["B"] = data[:5]
|
||||
|
||||
def test_setitem_tuple_index(self, data):
    """Setting by a tuple label on a Series whose index holds tuples."""
    tuple_labels = [(0, 0), (0, 1)]
    ser = pd.Series(data[:2], index=tuple_labels)
    want = pd.Series(data.take([1, 1]), index=ser.index)

    ser[(0, 0)] = data[1]
    self.assert_series_equal(ser, want)
|
||||
|
||||
def test_setitem_slice(self, data, box_in_series):
    """A scalar assigned to a slice overwrites every position in the slice."""
    want = data.take([0, 0, 0, 3, 4])
    target = data[:5].copy()
    if box_in_series:
        target = pd.Series(target)
        want = pd.Series(want)

    target[:3] = data[0]
    self.assert_equal(target, want)
|
||||
|
||||
def test_setitem_loc_iloc_slice(self, data):
    """Slice assignment through ``iloc`` and ``loc`` gives the same result."""
    head = data[:5].copy()
    s = pd.Series(head, index=["a", "b", "c", "d", "e"])
    want = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)

    # Positional slice.
    by_position = s.copy()
    by_position.iloc[:3] = data[0]
    self.assert_equal(by_position, want)

    # Label slice up to "c".
    by_label = s.copy()
    by_label.loc[:"c"] = data[0]
    self.assert_equal(by_label, want)
|
||||
|
||||
def test_setitem_slice_mismatch_length_raises(self, data):
    """Assigning two values into a one-element slice must raise ValueError."""
    head = data[:5]
    with pytest.raises(ValueError):
        head[:1] = head[:2]
|
||||
|
||||
def test_setitem_slice_array(self, data):
    """Assigning an equal-length array to a full slice replaces all values."""
    target = data[:5].copy()
    target[:5] = data[-5:]
    self.assert_extension_array_equal(target, data[-5:])
|
||||
|
||||
def test_setitem_scalar_key_sequence_raise(self, data):
    """A scalar key combined with a two-element value must raise ValueError."""
    target = data[:5].copy()
    with pytest.raises(ValueError):
        target[0] = target[[0, 1]]
|
||||
|
||||
def test_setitem_preserves_views(self, data):
|
||||
# GH#28150 setitem shouldn't swap the underlying data
|
||||
view1 = data.view()
|
||||
view2 = data[:]
|
||||
|
||||
data[0] = data[1]
|
||||
assert view1[0] == data[1]
|
||||
assert view2[0] == data[1]
|
||||
|
||||
def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
    """Create a new column through ``.loc`` using an indexer from ``full_indexer``."""
    # https://github.com/pandas-dev/pandas/issues/32395
    source = pd.DataFrame({"data": pd.Series(data)})
    # The source frame doubles as the expected result.
    expected = source
    result = pd.DataFrame(index=source.index)

    row_key = full_indexer(source)
    result.loc[row_key, "data"] = source["data"]

    self.assert_frame_equal(result, expected)
|
||||
|
||||
def test_setitem_with_expansion_row(self, data, na_value):
    """Appending rows through ``.loc`` with a valid value, then with NA."""
    df = pd.DataFrame({"data": data[:1]})

    # Enlarge with a regular value.
    df.loc[1, "data"] = data[1]
    two_rows = pd.DataFrame({"data": data[:2]})
    self.assert_frame_equal(df, two_rows)

    # https://github.com/pandas-dev/pandas/issues/47284
    df.loc[2, "data"] = na_value
    with_na = pd.Series([data[0], data[1], na_value], dtype=data.dtype)
    three_rows = pd.DataFrame({"data": with_na})
    self.assert_frame_equal(df, three_rows)
|
||||
|
||||
def test_setitem_series(self, data, full_indexer):
    """Assign an extension Series into an object-dtype Series through ``.loc``."""
    # https://github.com/pandas-dev/pandas/issues/32395
    ser = pd.Series(data, name="data")
    result = pd.Series(index=ser.index, dtype=object, name="data")

    # because result has object dtype, the attempt to do setting inplace
    # is successful, and object dtype is retained
    row_key = full_indexer(ser)
    result.loc[row_key] = ser

    as_objects = data.astype(object)
    expected = pd.Series(as_objects, index=ser.index, name="data", dtype=object)
    self.assert_series_equal(result, expected)
|
||||
|
||||
def test_setitem_frame_2d_values(self, data, request):
    """
    Assigning a frame or its 2D ``.values`` back through ``iloc`` is a no-op.

    Under ArrayManager, dtypes outside the listed group get an xfail marker
    added through ``request`` before the assignments run.
    """
    # GH#44514
    df = pd.DataFrame({"A": data})

    # Avoiding using_array_manager fixture
    # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
    using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
    if using_array_manager:
        if not isinstance(
            data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
        ):
            # These dtypes have non-broken implementations of _can_hold_element
            mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
            request.node.add_marker(mark)

    # ``df`` is reused from above; it was previously rebuilt identically here.
    orig = df.copy()

    df.iloc[:] = df
    self.assert_frame_equal(df, orig)

    df.iloc[:-1] = df.iloc[:-1]
    self.assert_frame_equal(df, orig)

    df.iloc[:] = df.values
    self.assert_frame_equal(df, orig)

    df.iloc[:-1] = df.values[:-1]
    self.assert_frame_equal(df, orig)
|
||||
|
||||
def test_delitem_series(self, data):
    """Deleting one entry of a Series leaves the remaining ones unchanged."""
    # GH#40763
    ser = pd.Series(data, name="data")

    # Every position except position 1.
    keep = np.delete(np.arange(len(ser)), 1)

    expected = ser[keep]
    del ser[1]
    self.assert_series_equal(ser, expected)
|
||||
|
||||
def test_setitem_invalid(self, data, invalid_scalar):
    """Setting an invalid scalar by position or by full slice must raise."""
    msg = ""  # messages vary by subclass, so we do not test it
    accepted_errors = (ValueError, TypeError)

    with pytest.raises(accepted_errors, match=msg):
        data[0] = invalid_scalar

    with pytest.raises(accepted_errors, match=msg):
        data[:] = invalid_scalar
|
||||
@@ -0,0 +1,195 @@
|
||||
import operator
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import Series
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """
    A fixture providing the ExtensionDtype to validate.

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """
    Length-100 array for this type.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos():
    """
    Length-100 array in which all the elements are two.

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """
    Length-2 array with [NA, Valid]

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """
    Parametrized fixture giving 'data' and 'data_missing'

    The fixture whose name matches ``request.param`` is returned.
    """
    by_name = {"data": data, "data_missing": data_missing}
    # An unrecognised name yields None, as with a fall-through if/elif.
    return by_name.get(request.param)
|
||||
|
||||
|
||||
@pytest.fixture
def data_repeated(data):
    """
    Generate many datasets.

    Parameters
    ----------
    data : fixture implementing `data`

    Returns
    -------
    Callable[[int], Generator]:
        A callable that takes a `count` argument and
        returns a generator yielding `count` datasets.
    """

    def gen(count):
        # The same ``data`` object is yielded each time, not a copy.
        for _unused in range(count):
            yield data

    return gen
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting():
    """
    Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting():
    """
    Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """
    Binary operator for comparing NA values.

    Should return a function of two arguments that returns
    True if both arguments are (scalar) NA for your type.

    By default, uses ``operator.is_``

    Returns
    -------
    callable
        ``operator.is_``, i.e. an identity comparison of its two arguments.
    """
    return operator.is_
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """
    The scalar missing value for this type. Default 'None'

    Returns
    -------
    None
    """
    return None
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping():
    """
    Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing

    Raises
    ------
    NotImplementedError
        Always; this definition has no implementation of its own.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def box_in_series(request):
    """
    Whether to box the data in a Series

    Returns the current parameter value (True or False) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        lambda x: 1,
        lambda x: [1] * len(x),
        lambda x: Series([1] * len(x)),
        lambda x: x,
    ],
    ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
    """
    Functions to test groupby.apply().

    Each parameter is a one-argument callable returning, in order: a scalar,
    a list as long as its input, a Series as long as its input, and the
    input itself.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def as_frame(request):
    """
    Boolean fixture to support Series and Series.to_frame() comparison testing.

    Returns the current parameter value (True or False) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def as_series(request):
    """
    Boolean fixture to support arr and Series(arr) comparison testing.

    Returns the current parameter value (True or False) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def use_numpy(request):
    """
    Boolean fixture to support comparison testing of ExtensionDtype array
    and numpy array.

    Returns the current parameter value (True or False) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
    """
    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
    Series.fillna(method=<method>) testing.

    Returns the current parameter value (a string) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def as_array(request):
    """
    Boolean fixture to support ExtensionDtype _from_sequence method testing.

    Returns the current parameter value (True or False) unchanged.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture
def invalid_scalar(data):
    """
    A scalar that *cannot* be held by this ExtensionArray.

    The default should work for most subclasses, but is not guaranteed.

    If the array can hold any item (i.e. object dtype), then use pytest.skip.
    """
    # A fresh, featureless ``object`` instance.
    return object()
|
||||
@@ -0,0 +1,6 @@
|
||||
from pandas.tests.extension.date.array import (
|
||||
DateArray,
|
||||
DateDtype,
|
||||
)
|
||||
|
||||
__all__ = ["DateArray", "DateDtype"]
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,183 @@
|
||||
import datetime as dt
|
||||
from typing import (
|
||||
Any,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
Dtype,
|
||||
PositionalIndexer,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.dtypes import register_extension_dtype
|
||||
|
||||
from pandas.api.extensions import (
|
||||
ExtensionArray,
|
||||
ExtensionDtype,
|
||||
)
|
||||
from pandas.api.types import pandas_dtype
|
||||
|
||||
|
||||
@register_extension_dtype
class DateDtype(ExtensionDtype):
    """Extension dtype whose scalar type is ``datetime.date``."""

    @property
    def type(self):
        """Scalar type of the array elements (``datetime.date``)."""
        return dt.date

    @property
    def name(self):
        """String identifier of this dtype."""
        return "DateDtype"

    @classmethod
    def construct_from_string(cls, string: str):
        """
        Build the dtype from its string form.

        Only the class name itself is accepted. Non-string input and any
        other string raise TypeError.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string != cls.__name__:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return DateArray

    @property
    def na_value(self):
        """Missing-value sentinel: ``datetime.date.min``."""
        return dt.date.min

    def __repr__(self) -> str:
        return self.name
|
||||
|
||||
|
||||
class DateArray(ExtensionArray):
    """
    Extension array of ``datetime.date`` values stored column-wise.

    Year, month and day live in three parallel NumPy arrays (``_year``,
    ``_month``, ``_day``) of equal length. ``datetime.date.min`` is
    treated as the missing value (see ``isna``).
    """

    def __init__(
        self,
        dates: Union[
            dt.date,
            Sequence[dt.date],
            Tuple[np.ndarray, np.ndarray, np.ndarray],
            np.ndarray,
        ],
    ) -> None:
        """
        Build the array from one of the supported inputs.

        Parameters
        ----------
        dates : datetime.date, list, tuple of 3 ndarrays, or "U10" ndarray
            * a single date -> length-1 array
            * a list of dates
            * a ``(years, months, days)`` triple of equal-length ndarrays
            * an ndarray of ``"yyyy-mm-dd"`` strings with dtype ``"U10"``

        Raises
        ------
        ValueError
            If a tuple is not a triple or its members differ in length.
        TypeError
            If a tuple member is not an ndarray, or the input type is not
            one of the supported ones.
        """
        if isinstance(dates, dt.date):
            self._year = np.array([dates.year])
            self._month = np.array([dates.month])
            # Store the day here (the year used to be written into this
            # column by mistake).
            self._day = np.array([dates.day])
            return

        ldates = len(dates)
        if isinstance(dates, list):
            # pre-allocate the arrays since we know the size before hand
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
            # populate them
            for i, date in enumerate(dates):
                self._year[i] = date.year
                self._month[i] = date.month
                self._day[i] = date.day

        elif isinstance(dates, tuple):
            # only support triples
            if ldates != 3:
                raise ValueError("only triples are valid")
            # check if all elements have the same type
            if not all(isinstance(x, np.ndarray) for x in dates):
                raise TypeError("invalid type")
            ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
            if not ly == lm == ld:
                raise ValueError(
                    f"tuple members must have the same length: {(ly, lm, ld)}"
                )
            self._year = dates[0].astype(np.uint16)
            self._month = dates[1].astype(np.uint8)
            self._day = dates[2].astype(np.uint8)

        elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
            self._year = np.zeros(ldates, dtype=np.uint16)  # 65535 (0, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)

            # "object_" object is not iterable [misc]
            for (i,), (y, m, d) in np.ndenumerate(  # type: ignore[misc]
                np.char.split(dates, sep="-")
            ):
                self._year[i] = int(y)
                self._month[i] = int(m)
                self._day[i] = int(d)

        else:
            raise TypeError(f"{type(dates)} is not supported")

    @property
    def dtype(self) -> ExtensionDtype:
        """A new ``DateDtype`` instance."""
        return DateDtype()

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        A ``DateDtype`` target returns this array (or a copy when ``copy``
        is true); anything else is delegated to ``to_numpy`` with
        ``datetime.date.min`` as the NA value.
        """
        dtype = pandas_dtype(dtype)

        if isinstance(dtype, DateDtype):
            data = self.copy() if copy else self
        else:
            data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)

        return data

    @property
    def nbytes(self) -> int:
        """Combined byte size of the three component arrays."""
        return self._year.nbytes + self._month.nbytes + self._day.nbytes

    def __len__(self) -> int:
        return len(self._year)  # all 3 arrays are enforced to have the same length

    def __getitem__(self, item: PositionalIndexer):
        """Return the date at integer position ``item``; other keys are unsupported."""
        if isinstance(item, int):
            return dt.date(self._year[item], self._month[item], self._day[item])
        else:
            raise NotImplementedError("only ints are supported as indexes")

    def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any):
        """Store a ``datetime.date`` at integer position ``key``."""
        if not isinstance(key, int):
            raise NotImplementedError("only ints are supported as indexes")

        if not isinstance(value, dt.date):
            raise TypeError("you can only set datetime.date types")

        self._year[key] = value.year
        self._month[key] = value.month
        self._day[key] = value.day

    def __repr__(self) -> str:
        return f"DateArray{list(zip(self._year, self._month, self._day))}"

    def copy(self) -> "DateArray":
        """Return a new array built from copies of the three component arrays."""
        return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))

    def isna(self) -> np.ndarray:
        """Boolean mask of the positions equal to ``datetime.date.min``."""
        return np.logical_and(
            np.logical_and(
                self._year == dt.date.min.year, self._month == dt.date.min.month
            ),
            self._day == dt.date.min.day,
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False):
        """
        Build a ``DateArray`` from a date, a sequence of dates, an ndarray
        or an existing ``DateArray``.

        ndarrays are first converted to ``"U10"`` strings. An existing
        ``DateArray`` is returned as-is (or copied when ``copy`` is true),
        because the constructor does not accept one.
        """
        if isinstance(scalars, DateArray):
            return scalars.copy() if copy else scalars
        if isinstance(scalars, np.ndarray):
            scalars = scalars.astype("U10")  # 10 chars for yyyy-mm-dd
        return DateArray(scalars)
|
||||
@@ -0,0 +1,8 @@
|
||||
from pandas.tests.extension.decimal.array import (
|
||||
DecimalArray,
|
||||
DecimalDtype,
|
||||
make_data,
|
||||
to_decimal,
|
||||
)
|
||||
|
||||
__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,272 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import decimal
|
||||
import numbers
|
||||
import random
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import type_t
|
||||
|
||||
from pandas.core.dtypes.base import ExtensionDtype
|
||||
from pandas.core.dtypes.common import (
|
||||
is_dtype_equal,
|
||||
is_float,
|
||||
pandas_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import (
|
||||
no_default,
|
||||
register_extension_dtype,
|
||||
)
|
||||
from pandas.api.types import (
|
||||
is_list_like,
|
||||
is_scalar,
|
||||
)
|
||||
from pandas.core import arraylike
|
||||
from pandas.core.arraylike import OpsMixin
|
||||
from pandas.core.arrays import (
|
||||
ExtensionArray,
|
||||
ExtensionScalarOpsMixin,
|
||||
)
|
||||
from pandas.core.indexers import check_array_indexer
|
||||
|
||||
|
||||
@register_extension_dtype
class DecimalDtype(ExtensionDtype):
    """Extension dtype for ``decimal.Decimal`` scalars, parametrized by a context."""

    type = decimal.Decimal
    name = "decimal"
    na_value = decimal.Decimal("NaN")
    _metadata = ("context",)

    def __init__(self, context=None):
        # A missing (or otherwise falsy) context falls back to the active one.
        if not context:
            context = decimal.getcontext()
        self.context = context

    def __repr__(self) -> str:
        return f"DecimalDtype(context={self.context})"

    @classmethod
    def construct_array_type(cls) -> type_t[DecimalArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return DecimalArray

    @property
    def _is_numeric(self) -> bool:
        """Always True for this dtype."""
        return True
|
||||
|
||||
|
||||
class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray):
    """
    Extension array holding ``decimal.Decimal`` objects in an object ndarray.

    The backing store is ``self._data``; ``self._items`` and ``self.data``
    are aliases of the same ndarray.
    """

    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False, context=None):
        """
        Parameters
        ----------
        values : sequence of Decimal or float
            Floats are converted in place inside ``values``: NaN becomes
            the dtype's NA value, other floats become ``Decimal``.
        dtype : ignored
        copy : bool, default False
            When true and ``values`` is already an object ndarray, store a
            copy instead of the caller's buffer.
        context : decimal.Context, optional
            Passed on to ``DecimalDtype``.

        Raises
        ------
        TypeError
            If an element is neither a float nor a ``Decimal``.
        """
        for i, val in enumerate(values):
            if is_float(val):
                if np.isnan(val):
                    values[i] = DecimalDtype.na_value
                else:
                    values[i] = DecimalDtype.type(val)
            elif not isinstance(val, decimal.Decimal):
                raise TypeError("All values must be of type " + str(decimal.Decimal))
        data = np.asarray(values, dtype=object)
        if copy and data is values:
            # np.asarray returned the caller's own ndarray; honour ``copy``
            # so that e.g. astype(..., copy=True) does not share memory.
            data = data.copy()

        self._data = data
        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self.data = self._data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data
        self._dtype = DecimalDtype(context)

    @property
    def dtype(self):
        """The ``DecimalDtype`` created in ``__init__``."""
        return self._dtype

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from ``scalars``; ``dtype`` and ``copy`` are not used."""
        return cls(scalars)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Parse every string with ``decimal.Decimal`` and construct from the result."""
        return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy)

    @classmethod
    def _from_factorized(cls, values, original):
        """Construct from factorized ``values``; ``original`` is not used."""
        return cls(values)

    _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray)

    def to_numpy(
        self, dtype=None, copy: bool = False, na_value=no_default, decimals=None
    ) -> np.ndarray:
        """
        Convert to ndarray, optionally rounding each element to ``decimals``.

        ``copy`` and ``na_value`` are accepted but not used here.
        """
        result = np.asarray(self, dtype=dtype)
        if decimals is not None:
            result = np.asarray([round(x, decimals) for x in result])
        return result

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        """
        Apply a NumPy ufunc to the underlying object arrays.

        Returns NotImplemented when an input is not one of the handled
        types. For ``reduce`` the pandas reduction dispatch is tried; when
        it declines, the plain ufunc result is used.
        """
        #
        if not all(
            isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs
        ):
            return NotImplemented

        inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs)
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if method == "reduce":
            dispatched = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if dispatched is not NotImplemented:
                return dispatched
            # Otherwise keep ``result`` from the direct ufunc call; it must
            # not be overwritten with NotImplemented before reconstruct().

        def reconstruct(x):
            if isinstance(x, (decimal.Decimal, numbers.Number)):
                return x
            else:
                return DecimalArray._from_sequence(x)

        if ufunc.nout > 1:
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def __getitem__(self, item):
        """Return a scalar for an integer key, else a new array over the selection."""
        if isinstance(item, numbers.Integral):
            return self._data[item]
        else:
            # array, slice.
            item = pd.api.indexers.check_array_indexer(self, item)
            return type(self)(self._data[item])

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements by position.

        With ``allow_fill`` and no explicit ``fill_value`` the dtype's NA
        value is used for filled positions.
        """
        from pandas.api.extensions import take

        data = self._data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return self._from_sequence(result)

    def copy(self):
        """Return a new array over a copy of the underlying data."""
        return type(self)(self._data.copy(), dtype=self.dtype)

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        An equal dtype with ``copy=False`` returns ``self``. Any
        ``DecimalDtype`` target yields a new array carrying that dtype's
        context. Everything else goes to the parent implementation.
        """
        if is_dtype_equal(dtype, self._dtype):
            if not copy:
                return self
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, type(self.dtype)):
            return type(self)(self._data, copy=copy, context=dtype.context)

        return super().astype(dtype, copy=copy)

    def __setitem__(self, key, value):
        """
        Set ``value`` at ``key``, converting through ``decimal.Decimal``.

        Raises
        ------
        ValueError
            If a list-like value is assigned to a scalar key.
        """
        if is_list_like(value):
            if is_scalar(key):
                raise ValueError("setting an array element with a sequence.")
            value = [decimal.Decimal(v) for v in value]
        else:
            value = decimal.Decimal(value)

        key = check_array_indexer(self, key)
        self._data[key] = value

    def __len__(self) -> int:
        return len(self._data)

    def __contains__(self, item) -> bool | np.bool_:
        """Non-Decimals are never contained; a NaN Decimal matches any NA entry."""
        if not isinstance(item, decimal.Decimal):
            return False
        elif item.is_nan():
            return self.isna().any()
        else:
            return super().__contains__(item)

    @property
    def nbytes(self) -> int:
        """Length times ``sys.getsizeof`` of the first element; 0 when empty."""
        n = len(self)
        if n:
            return n * sys.getsizeof(self[0])
        return 0

    def isna(self):
        """Boolean ndarray marking the elements for which ``is_nan()`` is true."""
        return np.array([x.is_nan() for x in self._data], dtype=bool)

    @property
    def _na_value(self):
        return decimal.Decimal("NaN")

    def _formatter(self, boxed=False):
        """Return the per-element formatter; boxed output is prefixed ``"Decimal: "``."""
        if boxed:
            return "Decimal: {}".format
        return repr

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the underlying arrays of ``to_concat`` into one new array."""
        return cls(np.concatenate([x._data for x in to_concat]))

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Reduce using the ndarray method called ``name``.

        Raises
        ------
        NotImplementedError
            If the underlying ndarray has no attribute ``name``.
        """

        if skipna:
            # If we don't have any NAs, we can ignore skipna
            if self.isna().any():
                other = self[~self.isna()]
                return other._reduce(name, **kwargs)

        if name == "sum" and len(self) == 0:
            # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy
            return decimal.Decimal(0)

        try:
            op = getattr(self.data, name)
        except AttributeError as err:
            raise NotImplementedError(
                f"decimal does not support the {name} operation"
            ) from err
        return op(axis=0)

    def _cmp_method(self, other, op):
        """Element-wise comparison returning a boolean ndarray."""
        # For use with OpsMixin
        def convert_values(param):
            if isinstance(param, ExtensionArray) or is_list_like(param):
                ovalues = param
            else:
                # Assume it's an object
                ovalues = [param] * len(self)
            return ovalues

        lvalues = self
        rvalues = convert_values(other)

        # If the operator is not defined for the underlying objects,
        # a TypeError should be raised
        res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]

        return np.asarray(res, dtype=bool)

    def value_counts(self, dropna: bool = True):
        """Delegate to ``pandas.core.algorithms.value_counts`` on the ndarray form."""
        from pandas.core.algorithms import value_counts

        return value_counts(self.to_numpy(), dropna=dropna)
|
||||
|
||||
|
||||
def to_decimal(values, context=None):
    """Convert each item of ``values`` to ``Decimal`` and wrap them in a DecimalArray."""
    converted = [decimal.Decimal(item) for item in values]
    return DecimalArray(converted, context=context)
|
||||
|
||||
|
||||
def make_data():
    """Return a list of 100 ``Decimal`` values built from ``random.random()``."""
    samples = []
    for _ in range(100):
        samples.append(decimal.Decimal(random.random()))
    return samples
|
||||
|
||||
|
||||
DecimalArray._add_arithmetic_ops()
|
||||
@@ -0,0 +1,481 @@
|
||||
import decimal
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import infer_dtype
|
||||
from pandas.tests.extension import base
|
||||
from pandas.tests.extension.decimal.array import (
|
||||
DecimalArray,
|
||||
DecimalDtype,
|
||||
make_data,
|
||||
to_decimal,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Provide a ``DecimalDtype`` constructed without an explicit context."""
    return DecimalDtype()
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Provide a ``DecimalArray`` built from the output of ``make_data()``."""
    return DecimalArray(make_data())
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos():
    """Provide a ``DecimalArray`` of 100 elements, each ``Decimal(2)``."""
    return DecimalArray([decimal.Decimal(2) for _ in range(100)])
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """Provide a length-2 ``DecimalArray``: ``[Decimal("NaN"), Decimal(1)]``."""
    return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)])
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting():
    """Provide a ``DecimalArray`` holding 1, 2, 0 in that order."""
    return DecimalArray(
        [decimal.Decimal("1"), decimal.Decimal("2"), decimal.Decimal("0")]
    )
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting():
    """Provide a ``DecimalArray`` holding 1, NaN, 0 in that order."""
    return DecimalArray(
        [decimal.Decimal("1"), decimal.Decimal("NaN"), decimal.Decimal("0")]
    )
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Provide a comparator that is true when both arguments report ``is_nan()``."""
    return lambda x, y: x.is_nan() and y.is_nan()
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Provide the scalar missing value used here: ``Decimal("NaN")``."""
    return decimal.Decimal("NaN")
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping():
    """Provide ``[B, B, NA, NA, A, A, B, C]`` with A=0.0, B=1.0, C=2.0 and NA=NaN."""
    smallest = decimal.Decimal("0.0")
    middle = decimal.Decimal("1.0")
    largest = decimal.Decimal("2.0")
    missing = decimal.Decimal("NaN")
    layout = [middle, middle, missing, missing, smallest, smallest, middle, largest]
    return DecimalArray(layout)
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
    """Dtype tests for the decimal extension, with two overrides."""

    def test_hashable(self, dtype):
        # Overridden with an empty body.
        pass

    @pytest.mark.parametrize("skipna", [True, False])
    def test_infer_dtype(self, data, data_missing, skipna):
        # here overriding base test to ensure we fall back to return
        # "unknown-array" for an EA pandas doesn't know
        for arr in (data, data_missing):
            assert infer_dtype(arr, skipna=skipna) == "unknown-array"
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
    """Inherits every test from ``base.BaseInterfaceTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
    """Inherits every test from ``base.BaseConstructorsTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
    """Inherits every test from ``base.BaseReshapingTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
    """Getitem tests for the decimal extension plus one decimal-specific case."""

    def test_take_na_value_other_decimal(self):
        """``take`` with an explicit Decimal ``fill_value`` uses it for index -1."""
        one, two = decimal.Decimal("1.0"), decimal.Decimal("2.0")
        filler = decimal.Decimal("-1.0")

        source = DecimalArray([one, two])
        result = source.take([0, -1], allow_fill=True, fill_value=filler)

        expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")])
        self.assert_extension_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits every test from ``base.BaseIndexTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
    """Inherits every test from ``base.BaseMissingTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class Reduce:
    """Mixin supplying ``check_reduce`` for the decimal reduction test classes."""

    def check_reduce(self, s, op_name, skipna):
        """
        Check reduction ``op_name`` on ``s``.

        median/skew/kurt must raise NotImplementedError; any other
        reduction is compared with the same method on ``np.asarray(s)``.
        """
        if op_name in ["median", "skew", "kurt"]:
            msg = r"decimal does not support the .* operation"
            with pytest.raises(NotImplementedError, match=msg):
                getattr(s, op_name)(skipna=skipna)
            return

        result = getattr(s, op_name)(skipna=skipna)
        expected = getattr(np.asarray(s), op_name)()
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
class TestNumericReduce(Reduce, base.BaseNumericReduceTests):
    """Combines ``Reduce`` with ``base.BaseNumericReduceTests``; no overrides."""

    pass
|
||||
|
||||
|
||||
class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests):
    """Combines ``Reduce`` with ``base.BaseBooleanReduceTests``; no overrides."""

    pass
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests for the decimal extension with a custom value_counts check."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna, request):
        """Compare value_counts of the extension data with a reference."""
        all_data = all_data[:10]
        # With dropna the reference is the non-missing values as an ndarray.
        other = np.array(all_data[~all_data.isna()]) if dropna else all_data

        counts = pd.Series(all_data).value_counts(dropna=dropna)
        reference_counts = pd.Series(other).value_counts(dropna=dropna)

        with decimal.localcontext() as ctx:
            # avoid raising when comparing Decimal("NAN") < Decimal(2)
            ctx.traps[decimal.InvalidOperation] = False

            result = counts.sort_index()
            expected = reference_counts.sort_index()

        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
    """Inherits every test from ``base.BaseCastingTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestGroupby(base.BaseGroupbyTests):
    """Groupby tests for the decimal extension."""

    def test_groupby_agg_extension(self, data_for_grouping):
        # Delegates to the inherited test unchanged.
        super().test_groupby_agg_extension(data_for_grouping)
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
    """Inherits every test from ``base.BaseSetitemTests`` without overrides."""

    pass
|
||||
|
||||
|
||||
class TestPrinting(base.BasePrintingTests):
    """Printing tests for the decimal extension."""

    def test_series_repr(self, data):
        # Overriding this base test to explicitly test that
        # the custom _formatter is used
        rendered = repr(pd.Series(data))
        assert data.dtype.name in rendered
        assert "Decimal: " in rendered
|
||||
|
||||
|
||||
@pytest.mark.xfail(
    reason=(
        "DecimalArray constructor raises bc _from_sequence wants Decimals, "
        "not ints. Easy to fix, just need to do it."
    ),
    raises=TypeError,
)
def test_series_constructor_coerce_data_to_extension_dtype_raises():
    """Building a Series of ints with ``DecimalDtype`` should raise ValueError.

    Marked as an expected failure: the test is expected to raise ``TypeError``
    instead of the ``ValueError`` asserted below.
    """
    xpr = (
        "Cannot cast data to extension dtype 'decimal'. Pass the "
        "extension array directly."
    )
    with pytest.raises(ValueError, match=xpr):
        pd.Series([0, 1, 2], dtype=DecimalDtype())
|
||||
|
||||
|
||||
def test_series_constructor_with_dtype():
    """Series construction from a DecimalArray honours an explicit ``dtype``."""
    values = DecimalArray([decimal.Decimal("10.0")])

    # Same extension dtype: identical to passing the array alone.
    tm.assert_series_equal(pd.Series(values, dtype=DecimalDtype()), pd.Series(values))

    # A NumPy dtype: the values are converted to plain integers.
    tm.assert_series_equal(pd.Series(values, dtype="int64"), pd.Series([10]))
|
||||
|
||||
|
||||
def test_dataframe_constructor_with_dtype():
    """DataFrame construction from a DecimalArray honours an explicit ``dtype``."""
    values = DecimalArray([decimal.Decimal("10.0")])

    # Same extension dtype: identical to passing the array alone.
    same_dtype = pd.DataFrame({"A": values}, dtype=DecimalDtype())
    tm.assert_frame_equal(same_dtype, pd.DataFrame({"A": values}))

    # A NumPy dtype, starting from a freshly built array.
    values = DecimalArray([decimal.Decimal("10.0")])
    as_int = pd.DataFrame({"A": values}, dtype="int64")
    tm.assert_frame_equal(as_int, pd.DataFrame({"A": [10]}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("frame", [True, False])
def test_astype_dispatches(frame):
    """Check that ``astype`` on a Series/DataFrame reaches the array's ``astype``.

    This is a dtype-specific test: designing a reliable smoke test that works
    for arbitrary data types is difficult.  The custom context precision must
    survive the cast.
    """
    ser = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a")
    ctx = decimal.Context(prec=5)

    obj = ser.to_frame() if frame else ser
    converted = obj.astype(DecimalDtype(ctx))
    result = converted["a"] if frame else converted

    assert result.dtype.context.prec == ctx.prec
|
||||
|
||||
|
||||
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic operator tests for the decimal array."""

    def check_opname(self, s, op_name, other, exc=None):
        """Run the base check, always expecting no exception (``exc=None``)."""
        super().check_opname(s, op_name, other, exc=None)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Exercise each arithmetic operator against Series and int operands.

        The ``DivisionByZero`` and ``InvalidOperation`` traps of the current
        decimal context are disabled for the duration of the checks and are
        restored afterwards, even when a check fails.
        """
        op_name = all_arithmetic_operators
        s = pd.Series(data)

        context = decimal.getcontext()
        divbyzerotrap = context.traps[decimal.DivisionByZero]
        invalidoptrap = context.traps[decimal.InvalidOperation]
        context.traps[decimal.DivisionByZero] = 0
        context.traps[decimal.InvalidOperation] = 0

        try:
            # Decimal supports ops with int, but not float
            other = pd.Series([int(d * 100) for d in data])
            self.check_opname(s, op_name, other)

            if "mod" not in op_name:
                self.check_opname(s, op_name, s * 2)

            self.check_opname(s, op_name, 0)
            self.check_opname(s, op_name, 5)
        finally:
            # Restore the traps so a failing check cannot leak the relaxed
            # context into subsequent tests.
            context.traps[decimal.DivisionByZero] = divbyzerotrap
            context.traps[decimal.InvalidOperation] = invalidoptrap

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        """Run the base divmod check expecting success, since divmod is implemented."""
        super()._check_divmod_op(s, op, other, exc=None)
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison operator tests for the decimal array."""

    def test_compare_scalar(self, data, comparison_op):
        """Compare a Series of the data against the scalar ``0.5``."""
        ser = pd.Series(data)
        self._compare_other(ser, data, comparison_op, 0.5)

    def test_compare_array(self, data, comparison_op):
        """Compare a Series of the data against a randomly rescaled copy."""
        ser = pd.Series(data)

        exponents = np.random.choice([-1, 0, 1], len(data))
        # Randomly double, halve or keep same value
        factors = [decimal.Decimal(pow(2.0, exp)) for exp in exponents]
        other = pd.Series(data) * factors
        self._compare_other(ser, data, comparison_op, other)
|
||||
|
||||
|
||||
class DecimalArrayWithoutFromSequence(DecimalArray):
    """Helper class for testing error handling in _from_sequence."""

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Always fail, so callers' fallback paths can be tested.

        Declared as a classmethod so that calls made on the class itself
        reach the ``KeyError`` below instead of failing earlier with a
        missing-argument ``TypeError``.

        Raises
        ------
        KeyError
            Unconditionally.
        """
        raise KeyError("For the test")
|
||||
|
||||
|
||||
class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence):
    """Variant whose arithmetic methods are built with ``coerce_to_dtype=False``."""

    @classmethod
    def _create_arithmetic_method(cls, op):
        """Return the method for ``op`` created with ``coerce_to_dtype=False``."""
        return cls._create_method(op, coerce_to_dtype=False)


# Install the arithmetic operators on the class defined above.
DecimalArrayWithoutCoercion._add_arithmetic_ops()
|
||||
|
||||
|
||||
def test_combine_from_sequence_raises(monkeypatch):
    """``Series.combine`` yields object dtype when ``_from_sequence`` fails.

    See https://github.com/pandas-dev/pandas/issues/22850
    """
    array_cls = DecimalArrayWithoutFromSequence

    # Make the dtype report the failing array class as its array type.
    monkeypatch.setattr(
        DecimalDtype,
        "construct_array_type",
        classmethod(lambda cls: DecimalArrayWithoutFromSequence),
    )

    ser = pd.Series(array_cls([decimal.Decimal("1.0"), decimal.Decimal("2.0")]))
    result = ser.combine(ser, operator.add)

    # note: object dtype
    summed = [decimal.Decimal("2.0"), decimal.Decimal("4.0")]
    expected = pd.Series(summed, dtype="object")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion]
)
def test_scalar_ops_from_sequence_raises(class_):
    """Adding two arrays returns an object ndarray for these helper classes.

    op(EA, EA) should return an EA, or an ndarray if it's not possible
    to return an EA with the return values.
    """
    operand = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
    doubled = [decimal.Decimal("2.0"), decimal.Decimal("4.0")]

    result = operand + operand

    tm.assert_numpy_array_equal(result, np.array(doubled, dtype="object"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "reverse, expected_div, expected_mod",
    [(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])],
)
def test_divmod_array(reverse, expected_div, expected_mod):
    """``divmod`` works with the array on either side of the scalar ``2``.

    See https://github.com/pandas-dev/pandas/issues/22930
    """
    arr = to_decimal([1, 2, 3, 4])
    quotient, remainder = divmod(2, arr) if reverse else divmod(arr, 2)

    tm.assert_extension_array_equal(quotient, to_decimal(expected_div))
    tm.assert_extension_array_equal(remainder, to_decimal(expected_mod))
|
||||
|
||||
|
||||
def test_ufunc_fallback(data):
    """``np.abs`` on a Series matches ``np.abs`` on the array, keeping the index."""
    first_five = data[:5]
    index = range(3, 8)

    result = np.abs(pd.Series(first_five, index=index))

    expected = pd.Series(np.abs(first_five), index=index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_array_ufunc():
    """``np.exp`` on the array equals the array built from ``np.exp`` of its data."""
    arr = to_decimal([1, 2, 3])
    tm.assert_extension_array_equal(np.exp(arr), to_decimal(np.exp(arr._data)))
|
||||
|
||||
|
||||
def test_array_ufunc_series():
    """``np.exp`` on a Series of decimals returns a Series of decimals."""
    arr = to_decimal([1, 2, 3])

    result = np.exp(pd.Series(arr))

    expected = pd.Series(to_decimal(np.exp(arr._data)))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_array_ufunc_series_scalar_other():
    """``np.add`` of a Series and a Decimal scalar matches the array-level result."""
    # check _HANDLED_TYPES
    arr = to_decimal([1, 2, 3])
    one = decimal.Decimal(1)

    result = np.add(pd.Series(arr), one)

    tm.assert_series_equal(result, pd.Series(np.add(arr, one)))
|
||||
|
||||
|
||||
def test_array_ufunc_series_defer():
    """``np.add`` of a Series and its array gives a Series in either operand order."""
    arr = to_decimal([1, 2, 3])
    ser = pd.Series(arr)
    expected = pd.Series(to_decimal([2, 4, 6]))

    series_first = np.add(ser, arr)
    array_first = np.add(arr, ser)

    tm.assert_series_equal(series_first, expected)
    tm.assert_series_equal(array_first, expected)
|
||||
|
||||
|
||||
def test_groupby_agg():
    """The result of ``agg`` is inferred to be decimal dtype.

    See https://github.com/pandas-dev/pandas/issues/29141
    """
    data = make_data()[:5]
    df = pd.DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    # single key, selected column
    expected = pd.Series(to_decimal([data[0], data[3]]))
    single_key = (df.groupby("id1")["decimals"], df["decimals"].groupby(df["id1"]))
    for grouped in single_key:
        result = grouped.agg(lambda x: x.iloc[0])
        tm.assert_series_equal(result, expected, check_names=False)

    # multiple keys, selected column
    expected = pd.Series(
        to_decimal([data[0], data[1], data[3]]),
        index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]),
    )
    multi_key = (
        df.groupby(["id1", "id2"])["decimals"],
        df["decimals"].groupby([df["id1"], df["id2"]]),
    )
    for grouped in multi_key:
        result = grouped.agg(lambda x: x.iloc[0])
        tm.assert_series_equal(result, expected, check_names=False)

    # multiple columns
    expected = pd.DataFrame({"id2": [0, 1], "decimals": to_decimal([data[0], data[3]])})
    result = df.groupby("id1").agg(lambda x: x.iloc[0])
    tm.assert_frame_equal(result, expected, check_names=False)
|
||||
|
||||
|
||||
def test_groupby_agg_ea_method(monkeypatch):
    """``agg`` with a method patched onto the array still infers decimal dtype.

    See https://github.com/pandas-dev/pandas/issues/29141
    """

    def DecimalArray__my_sum(self):
        return np.sum(np.array(self))

    # ``my_sum`` does not exist on the class, hence ``raising=False``.
    monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False)

    data = make_data()[:5]
    frame = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)})
    group_sums = [data[0] + data[1] + data[2], data[3] + data[4]]
    expected = pd.Series(to_decimal(group_sums))

    # Grouping through the frame column.
    result = frame.groupby("id")["decimals"].agg(lambda x: x.values.my_sum())
    tm.assert_series_equal(result, expected, check_names=False)

    # Grouping a standalone Series by an array of keys.
    ser = pd.Series(DecimalArray(data))
    keys = np.array([0, 0, 0, 1, 1])
    result = ser.groupby(keys).agg(lambda x: x.values.my_sum())
    tm.assert_series_equal(result, expected, check_names=False)
|
||||
|
||||
|
||||
def test_indexing_no_materialize(monkeypatch):
    """Indexing must not convert the ExtensionArray to a NumPy array.

    See https://github.com/pandas-dev/pandas/issues/29708.  ``__array__`` is
    patched to raise, so any unnecessary materialization fails the test.
    """

    def DecimalArray__array__(self, dtype=None):
        raise Exception("tried to convert a DecimalArray to a numpy array")

    monkeypatch.setattr(DecimalArray, "__array__", DecimalArray__array__, raising=False)

    ser = pd.Series(DecimalArray(make_data()))
    frame = pd.DataFrame({"a": ser, "b": range(len(ser))})

    # ensure the following operations do not raise an error
    ser[ser > 0.5]
    frame[ser > 0.5]
    ser.at[0]
    frame.at[0, "a"]
|
||||
|
||||
|
||||
def test_to_numpy_keyword():
    """The extra ``decimals`` keyword of ``to_numpy`` works on array and Series."""
    # test the extra keyword
    arr = pd.array(
        [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")], dtype="decimal"
    )
    rounded = [decimal.Decimal("1.11"), decimal.Decimal("2.22")]
    expected = np.array(rounded, dtype="object")

    tm.assert_numpy_array_equal(arr.to_numpy(decimals=2), expected)
    tm.assert_numpy_array_equal(pd.Series(arr).to_numpy(decimals=2), expected)
|
||||
@@ -0,0 +1,7 @@
|
||||
from pandas.tests.extension.json.array import (
|
||||
JSONArray,
|
||||
JSONDtype,
|
||||
make_data,
|
||||
)
|
||||
|
||||
__all__ = ["JSONArray", "JSONDtype", "make_data"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
Test extension array for storing nested data in a pandas container.
|
||||
|
||||
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
|
||||
not an ndarray.
|
||||
|
||||
Note
|
||||
----
|
||||
We currently store lists of UserDicts. Pandas has a few places
|
||||
internally that specifically check for dicts, and does non-scalar things
|
||||
in that case. We *want* the dictionaries to be treated as scalars, so we
|
||||
hack around pandas by using UserDicts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import (
|
||||
UserDict,
|
||||
abc,
|
||||
)
|
||||
import itertools
|
||||
import numbers
|
||||
import random
|
||||
import string
|
||||
import sys
|
||||
from typing import (
|
||||
Any,
|
||||
Mapping,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import type_t
|
||||
|
||||
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_list_like,
|
||||
pandas_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.extensions import (
|
||||
ExtensionArray,
|
||||
ExtensionDtype,
|
||||
)
|
||||
from pandas.core.indexers import unpack_tuple_and_ellipses
|
||||
|
||||
|
||||
class JSONDtype(ExtensionDtype):
    """Extension dtype named ``"json"`` whose scalars are mappings."""

    name = "json"
    type = abc.Mapping
    # An empty UserDict is the missing-value marker for this dtype.
    na_value: Mapping[str, Any] = UserDict()

    @classmethod
    def construct_array_type(cls) -> type_t[JSONArray]:
        """
        Return the array class paired with this dtype.

        Returns
        -------
        type
        """
        return JSONArray
|
||||
|
||||
|
||||
class JSONArray(ExtensionArray):
    """Test extension array that stores mappings in a plain Python list."""

    dtype = JSONDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        """Store ``values`` after checking every element is a mapping.

        ``dtype`` and ``copy`` are accepted but not used; ``values`` is kept
        by reference.

        Raises
        ------
        TypeError
            If any element is not an instance of ``self.dtype.type``.
        """
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build an array directly from ``scalars``; ``dtype``/``copy`` are unused."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild an array from factorized tuples, dropping empty ones."""
        return cls([UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        """Return a scalar for an integer index, otherwise a new array.

        Raises
        ------
        IndexError
            If ``item`` is neither an integer, a slice nor list-like.
        """
        if isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, slice) and item == slice(None):
            # Make sure we get a view
            return type(self)(self.data)
        elif isinstance(item, slice):
            # slice
            return type(self)(self.data[item])
        elif not is_list_like(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        else:
            item = pd.api.indexers.check_array_indexer(self, item)
            if is_bool_dtype(item.dtype):
                return self._from_sequence([x for x, m in zip(self, item) if m])
            # integer
            return type(self)([self.data[i] for i in item])

    def __setitem__(self, key, value):
        """Assign ``value`` at an integer, boolean-mask or iterable ``key``.

        A ``value`` that is neither a ``JSONArray`` nor a sequence is
        broadcast to every selected position.
        """
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self), abc.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == "bool":
                # masking
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v

    def __len__(self) -> int:
        """Return the number of stored elements."""
        return len(self.data)

    def __eq__(self, other):
        """Comparison is not implemented for this array."""
        return NotImplemented

    def __ne__(self, other):
        """Comparison is not implemented for this array."""
        return NotImplemented

    def __array__(self, dtype=None):
        """Convert to a NumPy array, defaulting to object dtype."""
        if dtype is None:
            dtype = object
        return np.asarray(self.data, dtype=dtype)

    @property
    def nbytes(self) -> int:
        """Size of the backing list as reported by ``sys.getsizeof`` (shallow)."""
        return sys.getsizeof(self.data)

    def isna(self):
        """Return a boolean mask marking elements equal to the dtype's NA value."""
        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position.

        With ``allow_fill=True`` the index ``-1`` selects ``fill_value``
        (the dtype's NA value when ``fill_value`` is None).

        Raises
        ------
        ValueError
            If ``allow_fill`` is true and any index is below ``-1``.
        IndexError
            If an index is out of bounds for the stored data.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError(
                    "Invalid value in 'indexer': with allow_fill=True every "
                    "index must be >= -1."
                )
            try:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError as err:
                raise IndexError(msg) from err
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError as err:
                raise IndexError(msg) from err

        return self._from_sequence(output)

    def copy(self):
        """Return a new array backed by a shallow copy of the data list."""
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``.

        Returns this array (or a copy) for the same dtype, a string array for
        a ``StringDtype``, and otherwise a NumPy array of plain dicts.
        """
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)
        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif isinstance(dtype, StringDtype):
            value = self.astype(str)  # numpy doesn't like nested dicts
            return dtype.construct_array_type()._from_sequence(value, copy=False)

        return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        """Return an array of the distinct elements, as plain dicts."""
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several arrays by chaining their data lists."""
        data = list(itertools.chain.from_iterable(x.data for x in to_concat))
        return cls(data)

    def _values_for_factorize(self):
        """Return the tuple-of-items values and ``()`` as the NA sentinel."""
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()

    def _values_for_argsort(self):
        """Return a 1-d object array holding each element's items as a tuple."""
        # Bypass NumPy's shape inference to get a (N,) array of tuples.
        frozen = [tuple(x.items()) for x in self]
        return construct_1d_object_array_from_listlike(frozen)
|
||||
|
||||
|
||||
def make_data():
    """Return 100 random ``UserDict`` objects.

    Each one is built from 0 to 10 pairs of a random ASCII letter and a
    random integer in ``[0, 100]``; repeated letters collapse into one key.
    """
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    records = []
    for _ in range(100):
        n_pairs = random.randint(0, 10)
        pairs = []
        for _ in range(n_pairs):
            letter = random.choice(string.ascii_letters)
            number = random.randint(0, 100)
            pairs.append((letter, number))
        records.append(UserDict(pairs))
    return records
|
||||
@@ -0,0 +1,371 @@
|
||||
import collections
|
||||
import operator
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.extension import base
|
||||
from pandas.tests.extension.json.array import (
|
||||
JSONArray,
|
||||
JSONDtype,
|
||||
make_data,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Return a fresh ``JSONDtype`` instance."""
    return JSONDtype()
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """JSONArray built from ``make_data()`` for semantics tests."""
    data = make_data()

    # Why the while loop? NumPy is unable to construct an ndarray from
    # equal-length UserDicts. Many of our operations involve coercing the
    # EA to an ndarray of objects. To avoid random test failures, we ensure
    # that our data is coercible to an ndarray. Several tests deal with only
    # the first two elements, so that's what we'll check.

    while len(data[0]) == len(data[1]):
        data = make_data()

    return JSONArray(data)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    missing, valid = {}, {"a": 10}
    return JSONArray([missing, valid])
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting():
    """Length 3 array of non-empty dicts used by the sorting tests."""
    values = [{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]
    return JSONArray(values)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting():
    """Length 3 array for sorting tests with an empty dict in the middle."""
    values = [{"b": 1}, {}, {"a": 4}]
    return JSONArray(values)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value(dtype):
    """Return the ``na_value`` of the ``dtype`` fixture."""
    return dtype.na_value
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Return ``operator.eq`` as the function for comparing NA values."""
    return operator.eq
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping():
    """Length 8 array with repeated, empty and distinct dicts for groupby tests."""
    values = [
        {"b": 1},
        {"b": 1},
        {},
        {},
        {"a": 0, "c": 2},
        {"a": 0, "c": 2},
        {"b": 1},
        {"c": 2},
    ]
    return JSONArray(values)
|
||||
|
||||
|
||||
class BaseJSON:
    """Mixin providing JSON-aware replacements for the equality assertions."""

    # NumPy doesn't handle an array of equal-length UserDicts.
    # The default assert_series_equal eventually does a
    # Series.values, which raises. We work around it by
    # converting the UserDicts to dicts.
    @classmethod
    def assert_series_equal(cls, left, right, *args, **kwargs):
        """Assert two Series are equal, rebuilding json-dtype data first.

        When ``left`` has the ``"json"`` dtype, both sides must share that
        dtype and are rebuilt as ``JSONArray`` from their object-cast values
        before being handed to ``tm.assert_series_equal``.
        """
        if left.dtype.name == "json":
            assert left.dtype == right.dtype
            left = pd.Series(
                JSONArray(left.values.astype(object)), index=left.index, name=left.name
            )
            right = pd.Series(
                JSONArray(right.values.astype(object)),
                index=right.index,
                name=right.name,
            )
        tm.assert_series_equal(left, right, *args, **kwargs)

    @classmethod
    def assert_frame_equal(cls, left, right, *args, **kwargs):
        """Assert two DataFrames are equal, comparing json columns separately.

        Columns are compared first, then every json-dtype column goes through
        ``cls.assert_series_equal``; the remaining columns are compared with
        ``tm.assert_frame_equal``.
        """
        obj_type = kwargs.get("obj", "DataFrame")
        tm.assert_index_equal(
            left.columns,
            right.columns,
            exact=kwargs.get("check_column_type", "equiv"),
            check_names=kwargs.get("check_names", True),
            check_exact=kwargs.get("check_exact", False),
            check_categorical=kwargs.get("check_categorical", True),
            obj=f"{obj_type}.columns",
        )

        # Select only the labels whose dtype is json.  Taking ``.index`` of
        # the boolean mask itself would return every column label.
        is_json = left.dtypes == "json"
        jsons = is_json[is_json].index

        for col in jsons:
            cls.assert_series_equal(left[col], right[col], *args, **kwargs)

        left = left.drop(columns=jsons)
        right = right.drop(columns=jsons)
        tm.assert_frame_equal(left, right, *args, **kwargs)
|
||||
|
||||
|
||||
class TestDtype(BaseJSON, base.BaseDtypeTests):
    """Run the inherited dtype tests with the ``BaseJSON`` assertions."""
|
||||
|
||||
|
||||
class TestInterface(BaseJSON, base.BaseInterfaceTests):
    """Interface tests for ``JSONArray``."""

    def test_custom_asserts(self):
        """The custom assertions accept equal data and reject different data."""
        # This would always trigger the KeyError from trying to put
        # an array of equal-length UserDicts inside an ndarray.
        pairs = (("a", 1), ("b", 2), ("c", 3))
        data = JSONArray([collections.UserDict({key: val}) for key, val in pairs])

        same = pd.Series(data)
        self.assert_series_equal(same, same)
        self.assert_frame_equal(same.to_frame(), same.to_frame())

        different = pd.Series(data.take([0, 0, 1]))
        msg = r"ExtensionArray are different"
        with pytest.raises(AssertionError, match=msg):
            self.assert_series_equal(same, different)

        with pytest.raises(AssertionError, match=msg):
            self.assert_frame_equal(same.to_frame(), different.to_frame())

    @pytest.mark.xfail(
        reason="comparison method not implemented for JSONArray (GH-37867)"
    )
    def test_contains(self, data):
        """Run the base containment test; expected to fail, see GH-37867."""
        super().test_contains(data)
|
||||
|
||||
|
||||
class TestConstructors(BaseJSON, base.BaseConstructorsTests):
    """Constructor tests for ``JSONArray``; several are expected failures."""

    @pytest.mark.xfail(reason="not implemented constructor from dtype")
    def test_from_dtype(self, data):
        """Run the base test for constructing from our dtype and its string name."""
        # ``super(self)`` would raise TypeError (an instance is not a type)
        # before the base test ever ran, hidden by the xfail marker.
        super().test_from_dtype(data)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_no_data_with_index(self, dtype, na_value):
        """Run the base test under a lowered recursion limit (GH-33900)."""
        # RecursionError: maximum recursion depth exceeded in comparison
        rec_limit = sys.getrecursionlimit()
        try:
            # Limit to avoid stack overflow on Windows CI
            sys.setrecursionlimit(100)
            super().test_series_constructor_no_data_with_index(dtype, na_value)
        finally:
            sys.setrecursionlimit(rec_limit)

    @pytest.mark.xfail(reason="RecursionError, GH-33900")
    def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
        """Run the base test under a lowered recursion limit (GH-33900)."""
        # RecursionError: maximum recursion depth exceeded in comparison
        rec_limit = sys.getrecursionlimit()
        try:
            # Limit to avoid stack overflow on Windows CI
            sys.setrecursionlimit(100)
            super().test_series_constructor_scalar_na_with_index(dtype, na_value)
        finally:
            sys.setrecursionlimit(rec_limit)

    @pytest.mark.xfail(reason="collection as scalar, GH-33901")
    def test_series_constructor_scalar_with_index(self, data, dtype):
        """Run the base test; expected to fail, see GH-33901."""
        # TypeError: All values must be of type <class 'collections.abc.Mapping'>
        super().test_series_constructor_scalar_with_index(data, dtype)
|
||||
|
||||
|
||||
class TestReshaping(BaseJSON, base.BaseReshapingTests):
    """Reshaping tests for ``JSONArray``."""

    @pytest.mark.skip(reason="Different definitions of NA")
    def test_stack(self):
        """
        Skipped.

        The test does .astype(object).stack(). If `data` happens to contain
        missing values we end up with different rows, because we consider
        `{}` NA but `.astype(object)` doesn't.
        """

    @pytest.mark.xfail(reason="dict for NA")
    def test_unstack(self, data, index):
        """Run the base unstack test; expected to fail."""
        # The base test has NaN for the expected NA value.
        # this matches otherwise
        return super().test_unstack(data, index)
|
||||
|
||||
|
||||
class TestGetitem(BaseJSON, base.BaseGetitemTests):
    """Run the inherited getitem tests with the ``BaseJSON`` assertions."""
|
||||
|
||||
|
||||
class TestIndex(BaseJSON, base.BaseIndexTests):
    """Run the inherited index tests with the ``BaseJSON`` assertions."""
|
||||
|
||||
|
||||
class TestMissing(BaseJSON, base.BaseMissingTests):
    """Missing-data tests for ``JSONArray``; both fillna tests are skipped."""

    @pytest.mark.skip(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """Skipped: fillna treats a dictionary as a mapping, not a scalar."""

    @pytest.mark.skip(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """Skipped: fillna treats a dictionary as a mapping, not a scalar."""
|
||||
|
||||
|
||||
unhashable = pytest.mark.skip(reason="Unhashable")
|
||||
|
||||
|
||||
class TestReduce(base.BaseNoReduceTests):
    """Run the inherited no-reduce tests without any overrides."""
|
||||
|
||||
|
||||
class TestMethods(BaseJSON, base.BaseMethodsTests):
    """Method tests for ``JSONArray``; many base tests are skipped."""

    @unhashable
    def test_value_counts(self, all_data, dropna):
        """Skipped via the ``unhashable`` marker."""

    @unhashable
    def test_value_counts_with_normalize(self, data):
        """Skipped via the ``unhashable`` marker."""

    @unhashable
    def test_sort_values_frame(self):
        """Skipped via the ``unhashable`` marker."""
        # TODO (EA.factorize): see if _values_for_factorize allows this.

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """Run the base sort test for both sort directions."""
        super().test_sort_values(data_for_sorting, ascending, sort_by_key)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """Run the base sort-with-missing test for both sort directions."""
        super().test_sort_values_missing(
            data_missing_for_sorting, ascending, sort_by_key
        )

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_le(self, data_repeated):
        """Skipped: combine is not supported."""

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_add(self, data_repeated):
        """Skipped: combine is not supported."""

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_first(self, data):
        """Skipped: combine is not supported."""

    @unhashable
    def test_hash_pandas_object_works(self, data, kind):
        """Skipped via the ``unhashable`` marker; otherwise runs the base test."""
        super().test_hash_pandas_object_works(data, kind)

    @pytest.mark.skip(reason="broadcasting error")
    def test_where_series(self, data, na_value):
        """Skipped; otherwise runs the base test."""
        # Fails with
        # *** ValueError: operands could not be broadcast together
        # with shapes (4,) (4,) (0,)
        super().test_where_series(data, na_value)

    @pytest.mark.skip(reason="Can't compare dicts.")
    def test_searchsorted(self, data_for_sorting):
        """Skipped; otherwise runs the base test."""
        super().test_searchsorted(data_for_sorting)

    @pytest.mark.skip(reason="Can't compare dicts.")
    def test_equals(self, data, na_value, as_series):
        """Skipped: dicts can't be compared."""
|
||||
|
||||
|
||||
class TestCasting(BaseJSON, base.BaseCastingTests):
    """Casting tests for ``JSONArray``."""

    @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")
    def test_astype_str(self):
        """Skipped: currently fails in NumPy on np.array(self, dtype=str) with

        *** ValueError: setting an array element with a sequence
        """
|
||||
|
||||
|
||||
# We intentionally don't run base.BaseSetitemTests because pandas'
# internals have trouble setting sequences of values into scalar positions.
|
||||
|
||||
|
||||
class TestGroupby(BaseJSON, base.BaseGroupbyTests):
    """Groupby tests for ``JSONArray``; most are skipped as unhashable."""

    @unhashable
    def test_groupby_extension_transform(self):
        """
        Skipped.

        This currently fails in Series.name.setter, since the name must be
        hashable, but the value is a dictionary.  I think this is what we
        want, i.e. `.name` should be the original values, and not the values
        for factorization.
        """

    @unhashable
    def test_groupby_extension_apply(self):
        """
        Skipped.

        This fails in Index._do_unique_check with

        >   hash(val)
        E   TypeError: unhashable type: 'UserDict' with

        I suspect that once we support Index[ExtensionArray],
        we'll be able to dispatch unique.
        """

    @unhashable
    def test_groupby_extension_agg(self):
        """
        Skipped.

        This fails when we get to tm.assert_series_equal when left.index
        contains dictionaries, which are not hashable.
        """

    @unhashable
    def test_groupby_extension_no_sort(self):
        """
        Skipped.

        This fails when we get to tm.assert_series_equal when left.index
        contains dictionaries, which are not hashable.
        """

    @pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
    def test_groupby_agg_extension(self, data_for_grouping):
        """Run the base test; expected to fail, see GH#39098."""
        super().test_groupby_agg_extension(data_for_grouping)
|
||||
|
||||
|
||||
class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests):
    """Arithmetic operator tests for ``JSONArray``."""

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
        """Run the base test, marked xfail unless the first element has length 1."""
        first_has_one_key = len(data[0]) == 1
        if not first_has_one_key:
            request.node.add_marker(
                pytest.mark.xfail(reason="raises in coercing to Series")
            )
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_add_series_with_extension_array(self, data):
        """Adding the array to a Series of itself raises ``TypeError``."""
        ser = pd.Series(data)
        with pytest.raises(TypeError, match="unsupported"):
            ser + data

    def test_divmod_series_array(self):
        """Intentionally empty: not implemented, see GH 23287."""

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        """Run the base divmod check expecting ``TypeError``."""
        return super()._check_divmod_op(s, op, other, exc=TypeError)
|
||||
|
||||
|
||||
class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests):
    """Run the inherited comparison tests with the ``BaseJSON`` assertions."""
|
||||
|
||||
|
||||
class TestPrinting(BaseJSON, base.BasePrintingTests):
    """Run the inherited printing tests with the ``BaseJSON`` assertions."""
|
||||
@@ -0,0 +1,7 @@
|
||||
from pandas.tests.extension.list.array import (
|
||||
ListArray,
|
||||
ListDtype,
|
||||
make_data,
|
||||
)
|
||||
|
||||
__all__ = ["ListArray", "ListDtype", "make_data"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
Test extension array for storing nested data in a pandas container.
|
||||
|
||||
The ListArray stores an ndarray of lists.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numbers
|
||||
import random
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import type_t
|
||||
|
||||
from pandas.core.dtypes.base import ExtensionDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api.types import (
|
||||
is_object_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
|
||||
class ListDtype(ExtensionDtype):
    """Extension dtype whose scalar type is the builtin ``list``."""

    # Scalar type of the elements held by the array.
    type = list
    # String name of the dtype.
    name = "list"
    # Value used to represent a missing element.
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[ListArray]:
        """
        Return the array class that pairs with this dtype.

        Returns
        -------
        type
        """
        array_cls = ListArray
        return array_cls
|
||||
|
||||
|
||||
class ListArray(ExtensionArray):
    """
    Extension array that keeps its elements in a 1-D object ndarray.

    Every element is either a ``list`` or a missing value recognised by
    ``pd.isna``; anything else is rejected by the constructor.
    """

    dtype = ListDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        """
        Parameters
        ----------
        values : np.ndarray
            Elements to wrap; each must be a list or a missing value.
        dtype : optional
            Accepted for interface compatibility and not used.
        copy : bool, default False
            If True, store a copy of ``values`` instead of ``values`` itself.

        Raises
        ------
        TypeError
            If ``values`` is not an ndarray or holds an invalid element.
        """
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array as values")
        for val in values:
            if not isinstance(val, self.dtype.type) and not pd.isna(val):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        # Honour ``copy`` so that a caller asking for an independent array
        # does not end up sharing the input buffer.
        self.data = values.copy() if copy else values

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build a ListArray from a sequence of lists / missing values."""
        data = np.empty(len(scalars), dtype=object)
        # Fill slot by slot: ``data[:] = scalars`` lets NumPy turn equally
        # sized lists into a 2-D array, which then fails to broadcast into
        # the 1-D object array.
        for position, value in enumerate(scalars):
            data[position] = value
        return cls(data)

    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            return type(self)(self.data[item])

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        """Return a boolean ndarray that is True where the element is missing."""
        # pd.isna accepts every missing value the constructor accepts
        # (np.isnan raises TypeError for e.g. None).
        return np.array(
            [not isinstance(x, list) and bool(pd.isna(x)) for x in self.data],
            dtype=bool,
        )

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements by position.

        With ``allow_fill=True`` the position -1 yields ``fill_value``
        (the dtype's ``na_value`` when ``fill_value`` is None) and positions
        below -1 raise ValueError.  Out-of-bounds positions raise IndexError.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects (here: lists) into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError(
                    "Invalid value in 'indexer': with allow_fill=True all "
                    "positions must be >= -1."
                )

        try:
            if allow_fill:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            else:
                output = [self.data[loc] for loc in indexer]
        except IndexError as err:
            raise IndexError(msg) from err

        return self._from_sequence(output)

    def copy(self):
        """Return a new ListArray backed by its own object ndarray."""
        # ``self.data[:]`` is only a view on the same buffer, so assignments
        # into the "copy" would also change this array.
        return type(self)(self.data.copy())

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; returns a ListArray only for the own dtype."""
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # numpy has problems with astype(str) for nested elements
            return np.array([str(x) for x in self.data], dtype=dtype)
        return np.array(self.data, dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several ListArrays into one."""
        data = np.concatenate([x.data for x in to_concat])
        return cls(data)
|
||||
|
||||
|
||||
def make_data():
    """
    Build a length-100 object ndarray of random lists.

    Each element is a list holding between 0 and 10 random ASCII letters.
    """
    lists = []
    for _ in range(100):
        size = random.randint(0, 10)
        lists.append([random.choice(string.ascii_letters) for _ in range(size)])

    result = np.empty(100, dtype=object)
    result[:] = lists
    return result
|
||||
@@ -0,0 +1,33 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.tests.extension.list.array import (
|
||||
ListArray,
|
||||
ListDtype,
|
||||
make_data,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Provide a ListDtype instance."""
    list_dtype = ListDtype()
    return list_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Length-100 ListArray for semantics test."""
    candidate = make_data()

    # Draw again until the first two lists differ in length.
    while True:
        if len(candidate[0]) != len(candidate[1]):
            break
        candidate = make_data()

    return ListArray(candidate)
|
||||
|
||||
|
||||
def test_to_csv(data):
    # https://github.com/pandas-dev/pandas/issues/28840
    # array with list-likes fail when doing astype(str) on the numpy array
    # which was done in to_native_types
    frame = pd.DataFrame({"a": data})
    csv_text = frame.to_csv()
    first_as_text = str(data[0])
    assert first_as_text in csv_text
|
||||
@@ -0,0 +1,394 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays.boolean import BooleanDtype
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data():
    """Return 100 values: alternating True/False with NaN at positions 8 and 97."""
    pair = [True, False]
    values = pair * 4
    values.append(np.nan)
    values.extend(pair * 44)
    values.append(np.nan)
    values.extend(pair)
    return values
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Provide a BooleanDtype instance."""
    boolean_dtype = BooleanDtype()
    return boolean_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype):
    """Array built from ``make_data()`` with the fixture dtype."""
    raw_values = make_data()
    return pd.array(raw_values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos(dtype):
    """Array built from 100 ones, cast to the fixture dtype."""
    ones = np.ones(100)
    return pd.array(ones, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype):
    """Length-2 array: a missing value followed by True."""
    values = [np.nan, True]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype):
    """Length-3 array [True, True, False] used by the sorting tests."""
    values = [True, True, False]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Length-3 array [True, NA, False] used by the sorting tests."""
    values = [True, np.nan, False]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Return a comparator that is true only when both arguments are pd.NA."""

    # we are pd.NA
    def both_are_na(x, y):
        return x is pd.NA and y is pd.NA

    return both_are_na
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value scalar used by these tests: pd.NA."""
    missing = pd.NA
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype):
    """Length-7 array [B, B, NA, NA, A, A, B] with B=True and A=False."""
    lookup = {"A": False, "B": True, "NA": np.nan}
    layout = ["B", "B", "NA", "NA", "A", "A", "B"]
    return pd.array([lookup[key] for key in layout], dtype=dtype)
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests with boolean-specific expectations."""

    # Operators for which ``_check_op`` expects a TypeError to be raised.
    implements = {"__sub__", "__rsub__"}

    def check_opname(self, s, op_name, other, exc=None):
        # overwriting to indicate ops don't raise an error
        super().check_opname(s, op_name, other, exc=None)

    def _check_op(self, obj, op, other, op_name, exc=NotImplementedError):
        # An explicit exception type means the operation itself must raise it.
        if exc is not None:
            with pytest.raises(exc):
                op(obj, other)
            return

        if op_name in self.implements:
            with pytest.raises(TypeError, match=r"numpy boolean subtract"):
                op(obj, other)
            return

        result = op(obj, other)
        expected = self._combine(obj, other, op)

        integer_result_ops = (
            "__floordiv__",
            "__rfloordiv__",
            "__pow__",
            "__rpow__",
            "__mod__",
            "__rmod__",
        )
        if op_name in integer_result_ops:
            # combine keeps boolean type
            expected = expected.astype("Int8")
        elif op_name in ("__truediv__", "__rtruediv__"):
            # combine with bools does not generate the correct result
            #  (numpy behaviour for div is to regard the bools as numeric)
            as_float = obj.astype(float)
            expected = self._combine(as_float, other, op).astype("Float64")

        if op_name == "__rpow__":
            # for rpow, combine does not propagate NaN
            missing = result.isna()
            expected[missing] = np.nan

        self.assert_equal(result, expected)

    def _check_divmod_op(self, s, op, other, exc=None):
        # override to not raise an error
        super()._check_divmod_op(s, op, other, None)
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison-operator tests; comparisons are expected not to raise."""

    def check_opname(self, s, op_name, other, exc=None):
        # overwriting to indicate ops don't raise an error
        parent = super()
        parent.check_opname(s, op_name, other, exc=None)
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests, overridden where only two distinct values are available."""

    @pytest.mark.parametrize("na_sentinel", [-1, -2])
    def test_factorize(self, data_for_grouping, na_sentinel):
        # override because we only have 2 unique values
        codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        expected_codes = np.array(
            [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp
        )
        expected_uniques = data_for_grouping.take([0, 4])

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    def test_combine_le(self, data_repeated):
        # override because expected needs to be boolean instead of bool dtype
        def less_or_equal(x1, x2):
            return x1 <= x2

        left_data, right_data = data_repeated(2)
        left = pd.Series(left_data)
        right = pd.Series(right_data)

        result = left.combine(right, less_or_equal)
        pairwise = [a <= b for a, b in zip(list(left_data), list(right_data))]
        expected = pd.Series(pairwise, dtype="boolean")
        self.assert_series_equal(result, expected)

        # Same check against a scalar taken from the first Series.
        scalar = left.iloc[0]
        result = left.combine(scalar, less_or_equal)
        against_scalar = [a <= scalar for a in list(left_data)]
        expected = pd.Series(against_scalar, dtype="boolean")
        self.assert_series_equal(result, expected)

    def test_searchsorted(self, data_for_sorting, as_series):
        # override because we only have 2 unique values
        data_for_sorting = pd.array([True, False], dtype="boolean")
        high, low = data_for_sorting
        arr = type(data_for_sorting)._from_sequence([low, high])

        if as_series:
            arr = pd.Series(arr)

        assert arr.searchsorted(low) == 0
        assert arr.searchsorted(low, side="right") == 1

        assert arr.searchsorted(high) == 1
        assert arr.searchsorted(high, side="right") == 2

        positions = arr.searchsorted(arr.take([0, 1]))
        expected_positions = np.array([0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(positions, expected_positions)

        # sorter
        order = np.array([1, 0])
        assert data_for_sorting.searchsorted(low, sorter=order) == 0

    @pytest.mark.xfail(reason="uses nullable integer")
    def test_value_counts(self, all_data, dropna):
        parent = super()
        return parent.test_value_counts(all_data, dropna)

    @pytest.mark.xfail(reason="uses nullable integer")
    def test_value_counts_with_normalize(self, data):
        parent = super()
        parent.test_value_counts_with_normalize(data)

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
        # override because there are only 2 unique values

        # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False
        assert data_for_sorting.argmax() == 0
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        repeated = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert repeated.argmax() == 1
        assert repeated.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestGroupby(base.BaseGroupbyTests):
    """
    Groupby tests re-implemented for two groups: boolean data has only 2
    unique values, whereas the base tests use 3 groups.
    """

    def test_grouping_grouper(self, data_for_grouping):
        frame = pd.DataFrame(
            {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping}
        )
        by_a = frame.groupby("A").grouper.groupings[0]
        by_b = frame.groupby("B").grouper.groupings[0]

        tm.assert_numpy_array_equal(by_a.grouping_vector, frame.A.values)
        tm.assert_extension_array_equal(by_b.grouping_vector, data_for_grouping)

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = frame.groupby("B", as_index=as_index).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=True)

        if not as_index:
            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]})
            self.assert_frame_equal(result, expected)
            return

        index = pd.Index(uniques, name="B")
        expected = pd.Series([3.0, 1.0], index=index, name="A")
        self.assert_series_equal(result, expected)

    def test_groupby_agg_extension(self, data_for_grouping):
        # GH#38980 groupby agg on extension type fails for non-numeric types
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        expected = frame.iloc[[0, 2, 4]].set_index("A")

        # Three spellings of "first" that must all give the same frame.
        aggregations = (
            lambda grouped: grouped.agg({"B": "first"}),
            lambda grouped: grouped.agg("first"),
            lambda grouped: grouped.first(),
        )
        for aggregate in aggregations:
            result = aggregate(frame.groupby("A"))
            self.assert_frame_equal(result, expected)

    def test_groupby_extension_no_sort(self, data_for_grouping):
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = frame.groupby("B", sort=False).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=False)

        index = pd.Index(uniques, name="B")
        expected = pd.Series([1.0, 3.0], index=index, name="A")
        self.assert_series_equal(result, expected)

    def test_groupby_extension_transform(self, data_for_grouping):
        not_missing = ~data_for_grouping.isna()
        valid = data_for_grouping[not_missing]
        frame = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid})

        result = frame.groupby("B").A.transform(len)
        expected = pd.Series([3, 3, 2, 2, 3], name="A")

        self.assert_series_equal(result, expected)

    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        # Only checks that each apply call completes; results are not compared.
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        frame.groupby("B").apply(groupby_apply_op)
        frame.groupby("B").A.apply(groupby_apply_op)
        frame.groupby("A").apply(groupby_apply_op)
        frame.groupby("A").B.apply(groupby_apply_op)

    def test_groupby_apply_identity(self, data_for_grouping):
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = frame.groupby("A").B.apply(lambda x: x.array)

        rows_per_group = ([0, 1, 6], [2, 3], [4, 5])
        expected = pd.Series(
            [frame.B.iloc[rows].array for rows in rows_per_group],
            index=pd.Index([1, 2, 3], name="A"),
            name="B",
        )
        self.assert_series_equal(result, expected)

    def test_in_numeric_groupby(self, data_for_grouping):
        columns = {
            "A": [1, 1, 2, 2, 3, 3, 1],
            "B": data_for_grouping,
            "C": [1, 1, 1, 1, 1, 1, 1],
        }
        frame = pd.DataFrame(columns)
        result = frame.groupby("A").sum().columns

        kept = ["B", "C"] if data_for_grouping.dtype._is_numeric else ["C"]
        expected = pd.Index(kept)

        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("min_count", [0, 10])
    def test_groupby_sum_mincount(self, data_for_grouping, min_count):
        frame = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
        result = frame.groupby("A").sum(min_count=min_count)

        if min_count == 0:
            sums = pd.array([3, 0, 0], dtype="Int64")
        else:
            sums = pd.array([pd.NA] * 3, dtype="Int64")

        expected = pd.DataFrame({"B": sums}, index=pd.Index([1, 2, 3], name="A"))
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestNumericReduce(base.BaseNumericReduceTests):
    """Numeric reductions, compared against the same reduction on float64."""

    def check_reduce(self, s, op_name, skipna):
        reduce_kwargs = {"skipna": skipna}
        result = getattr(s, op_name)(**reduce_kwargs)

        as_float = s.astype("float64")
        expected = getattr(as_float, op_name)(**reduce_kwargs)

        # override parent function to cast to bool for min/max
        if np.isnan(expected):
            expected = pd.NA
        elif op_name in ("min", "max"):
            expected = bool(expected)

        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
class TestBooleanReduce(base.BaseBooleanReduceTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestPrinting(base.BasePrintingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestUnaryOps(base.BaseUnaryOpsTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestParsing(base.BaseParsingTests):
|
||||
pass
|
||||
|
||||
|
||||
class Test2DCompat(base.Dim2CompatTests):
|
||||
pass
|
||||
@@ -0,0 +1,310 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import CategoricalDtype
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data():
    """Draw 100 random ASCII letters whose first two entries differ."""
    letters = list(string.ascii_letters)
    values = np.random.choice(letters, size=100)
    # ensure we meet the requirements
    # 1. first two not null
    # 2. first and second are different
    while values[0] == values[1]:
        values = np.random.choice(letters, size=100)
    return values
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Provide a CategoricalDtype created without arguments."""
    categorical_dtype = CategoricalDtype()
    return categorical_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """
    Length-100 Categorical built from ``make_data()``.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal
    """
    raw_values = make_data()
    return Categorical(raw_values)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    values = [np.nan, "A"]
    return Categorical(values)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting():
    """Ordered Categorical ["A", "B", "C"] with category order C < A < B."""
    values = ["A", "B", "C"]
    category_order = ["C", "A", "B"]
    return Categorical(values, categories=category_order, ordered=True)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting():
    """Ordered Categorical ["A", None, "B"] with category order B < A."""
    values = ["A", None, "B"]
    category_order = ["B", "A"]
    return Categorical(values, categories=category_order, ordered=True)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value scalar used by these tests: NaN."""
    missing = np.nan
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping():
    """Length-8 Categorical: a, a, None, None, b, b, a, c."""
    values = ["a", "a", None, None, "b", "b", "a", "c"]
    return Categorical(values)
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
    """Interface tests with Categorical-specific overrides."""

    @pytest.mark.xfail(reason="Memory usage doesn't match")
    def test_memory_usage(self, data):
        # Is this deliberate?
        parent = super()
        parent.test_memory_usage(data)

    def test_contains(self, data, data_missing):
        # GH-37867
        # na value handling in Categorical.__contains__ is deprecated.
        # See base.BaseInterfaceTests.test_contains for more details.

        na_value = data.dtype.na_value
        # ensure data without missing values
        present = ~data.isna()
        data = data[present]

        # first elements are non-missing
        assert data[0] in data
        assert data_missing[0] in data_missing

        # check the presence of na_value
        assert na_value in data_missing
        assert na_value not in data

        # Categoricals can contain other nan-likes than na_value
        other_nulls = (obj for obj in tm.NULL_OBJECTS if obj is not na_value)
        for null_obj in other_nulls:
            assert null_obj not in data
            assert null_obj in data_missing  # this line differs from super method
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
    """Constructor tests with a Categorical-specific ``_empty`` check."""

    def test_empty(self, dtype):
        array_cls = dtype.construct_array_type()
        empty = array_cls._empty((4,), dtype=dtype)

        assert isinstance(empty, array_cls)
        # the dtype we passed is not initialized, so will not match the
        #  dtype on our result.
        expected_dtype = CategoricalDtype([])
        assert empty.dtype == expected_dtype
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
    """Getitem tests; the scalar-getitem test is skipped."""

    @pytest.mark.skip(reason="Backwards compatibility")
    def test_getitem_scalar(self, data):
        # CategoricalDtype.type isn't "correct" since it should
        # be a parent of the elements (object). But don't want
        # to break things by changing.
        parent = super()
        parent.test_getitem_scalar(data)
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestReduce(base.BaseNoReduceTests):
|
||||
pass
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests with Categorical-specific overrides."""

    @pytest.mark.xfail(reason="Unobserved categories included")
    def test_value_counts(self, all_data, dropna):
        parent = super()
        return parent.test_value_counts(all_data, dropna)

    def test_combine_add(self, data_repeated):
        # GH 20825
        # When adding categoricals in combine, result is a string
        def add(x1, x2):
            return x1 + x2

        left_data, right_data = data_repeated(2)
        left = pd.Series(left_data)
        right = pd.Series(right_data)

        result = left.combine(right, add)
        pairwise = [a + b for a, b in zip(list(left_data), list(right_data))]
        expected = pd.Series(pairwise)
        self.assert_series_equal(result, expected)

        # Same check against a scalar taken from the first Series.
        scalar = left.iloc[0]
        result = left.combine(scalar, add)
        expected = pd.Series([a + scalar for a in list(left_data)])
        self.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
    """Casting tests plus Categorical-specific astype checks."""

    @pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
    @pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), pd.NaT]])
    def test_cast_nan_to_int(self, cls, values):
        # GH 28406
        container = cls(values)

        with pytest.raises((ValueError, TypeError), match="Cannot (cast|convert)"):
            container.astype(int)

    @pytest.mark.parametrize(
        "expected",
        [
            pd.Series(["2019", "2020"], dtype="datetime64[ns, UTC]"),
            pd.Series([0, 0], dtype="timedelta64[ns]"),
            pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"),
            pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"),
            pd.Series([1, np.nan], dtype="Int64"),
        ],
    )
    def test_cast_category_to_extension_dtype(self, expected):
        # GH 28668
        # Round trip: original dtype -> category -> original dtype.
        as_category = expected.astype("category")
        result = as_category.astype(expected.dtype)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype, expected",
        [
            (
                "datetime64[ns]",
                np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"),
            ),
            (
                "datetime64[ns, MET]",
                pd.DatetimeIndex(
                    [Timestamp("2015-01-01 00:00:00+0100", tz="MET")]
                ).array,
            ),
        ],
    )
    def test_consistent_casting(self, dtype, expected):
        # GH 28448
        source = Categorical(["2015-01-01"])
        result = source.astype(dtype)
        assert result == expected
|
||||
|
||||
|
||||
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests with Categorical-specific overrides."""

    @staticmethod
    def _xfail_rmod(request, op_name):
        # Mark the running test as an expected failure for ``__rmod__`` only.
        if op_name == "__rmod__":
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="rmod never called when string is first argument"
                )
            )

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
        # frame & scalar
        op_name = all_arithmetic_operators
        self._xfail_rmod(request, op_name)
        super().test_arith_frame_with_scalar(data, op_name)

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
        op_name = all_arithmetic_operators
        self._xfail_rmod(request, op_name)
        super().test_arith_series_with_scalar(data, op_name)

    def test_add_series_with_extension_array(self, data):
        # Adding the array to a Series of itself must raise TypeError.
        series = pd.Series(data)
        with pytest.raises(TypeError, match="cannot perform|unsupported operand"):
            series + data

    def test_divmod_series_array(self):
        # GH 23287
        # skipping because it is not implemented
        return None

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        # The ``exc`` argument is not forwarded: TypeError is always expected.
        parent = super()
        return parent._check_divmod_op(s, op, other, exc=TypeError)
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison-operator tests with Categorical-specific expectations."""

    def _compare_other(self, s, data, op, other):
        op_name = f"__{op.__name__}__"
        # Element-wise reference implementations for the two equality operators.
        elementwise = {
            "__eq__": lambda x, y: x == y,
            "__ne__": lambda x, y: x != y,
        }

        if op_name not in elementwise:
            # Every other comparison is expected to raise.
            msg = "Unordered Categoricals can only compare equality or not"
            with pytest.raises(TypeError, match=msg):
                op(data, other)
            return

        result = op(s, other)
        expected = s.combine(other, elementwise[op_name])
        assert (result == expected).all()

    @pytest.mark.parametrize(
        "categories",
        [["a", "b"], [0, 1], [Timestamp("2019"), Timestamp("2020")]],
    )
    def test_not_equal_with_na(self, categories):
        # https://github.com/pandas-dev/pandas/issues/32276
        with_missing = Categorical.from_codes([-1, 0], categories=categories)
        without_missing = Categorical.from_codes([0, 1], categories=categories)

        result = with_missing != without_missing

        assert result.all()
|
||||
|
||||
|
||||
class TestParsing(base.BaseParsingTests):
|
||||
pass
|
||||
|
||||
|
||||
class Test2DCompat(base.NDArrayBacked2DTests):
    """2-D compatibility tests with a Categorical-specific repr check."""

    def test_repr_2d(self, data):
        # Categorical __repr__ doesn't include "Categorical", so we need
        # to special-case
        for shape in ((1, -1), (-1, 1)):
            text = repr(data.reshape(*shape))
            assert text.count("\nCategories") == 1
|
||||
@@ -0,0 +1,81 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes import dtypes
|
||||
from pandas.core.dtypes.common import is_extension_array_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
|
||||
class DummyDtype(dtypes.ExtensionDtype):
|
||||
pass
|
||||
|
||||
|
||||
class DummyArray(ExtensionArray):
    """Minimal ExtensionArray that wraps ``data`` without validating it."""

    def __init__(self, data):
        self.data = data

    def __array__(self, dtype=None, copy=None):
        """
        Return the wrapped data for NumPy's array protocol.

        Both arguments are optional, as the protocol requires: NumPy calls
        ``__array__()`` with no arguments when no dtype is requested (e.g.
        ``np.asarray(arr)``), which a required ``dtype`` parameter turns into
        a TypeError.  The arguments themselves are not used here.
        """
        return self.data

    @property
    def dtype(self):
        return DummyDtype()

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        A DummyDtype target returns a new DummyArray around the same data
        (``copy=True``) or this very object (``copy=False``); any other
        dtype is converted through ``np.array``.
        """
        # we don't support anything but a single dtype
        if isinstance(dtype, DummyDtype):
            if copy:
                return type(self)(self.data)
            return self

        return np.array(self, dtype=dtype, copy=copy)
|
||||
|
||||
|
||||
class TestExtensionArrayDtype:
    """Checks for ``is_extension_array_dtype`` on arrays, dtypes and Series."""

    @pytest.mark.parametrize(
        "values",
        [
            pd.Categorical([]),
            pd.Categorical([]).dtype,
            pd.Series(pd.Categorical([])),
            DummyDtype(),
            DummyArray(np.array([1, 2])),
        ],
    )
    def test_is_extension_array_dtype(self, values):
        recognised = is_extension_array_dtype(values)
        assert recognised

    @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))])
    def test_is_not_extension_array_dtype(self, values):
        recognised = is_extension_array_dtype(values)
        assert not recognised
|
||||
|
||||
|
||||
def test_astype():
    """astype to object gives the same ndarray for both spellings of the dtype."""
    arr = DummyArray(np.array([1, 2, 3]))
    expected = np.array([1, 2, 3], dtype=object)

    for target in (object, "object"):
        result = arr.astype(target)
        tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_astype_no_copy():
    """astype to the own dtype returns self only when copy=False."""
    arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))

    same = arr.astype(arr.dtype, copy=False)
    assert arr is same

    fresh = arr.astype(arr.dtype)
    assert arr is not fresh
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()])
def test_is_extension_array_dtype(dtype):
    """Each dtype is an ExtensionDtype and is recognised as an extension dtype."""
    is_extension_instance = isinstance(dtype, dtypes.ExtensionDtype)
    assert is_extension_instance
    recognised = is_extension_array_dtype(dtype)
    assert recognised
|
||||
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.arrays import DatetimeArray
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
@pytest.fixture(params=["US/Central"])
def dtype(request):
    """Nanosecond DatetimeTZDtype in the parametrized time zone."""
    timezone = request.param
    return DatetimeTZDtype(unit="ns", tz=timezone)
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype):
    """DatetimeArray of 100 periods starting at "2000" in the dtype's zone."""
    stamps = pd.date_range("2000", periods=100, tz=dtype.tz)
    return DatetimeArray(stamps, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype):
    """Two-element DatetimeArray: NaT followed by 2000-01-01."""
    raw = np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype):
    """Three timestamps ordered [B, C, A] where A < B < C."""
    a, b, c = (pd.Timestamp(day) for day in ("2000-01-01", "2000-01-02", "2000-01-03"))
    raw = np.array([b, c, a], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Three elements ordered [B, NaT, A] where A < B."""
    a, b = pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")
    raw = np.array([b, "NaT", a], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype):
    """
    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing
    """
    na = "NaT"
    a, b, c = (pd.Timestamp(day) for day in ("2000-01-01", "2000-01-02", "2000-01-03"))
    raw = np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Comparator that is true only when both arguments are the pd.NaT singleton."""
    # identity checks: `a` must be NaT and `b` must be that same object
    return lambda a, b: a is pd.NaT and a is b
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value sentinel used by these tests: pd.NaT."""
    missing = pd.NaT
    return missing
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
class BaseDatetimeTests:
    """Empty mixin shared by the datetime test classes in this module."""
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Tests
|
||||
class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests):
    """Inherits base.BaseDtypeTests without overrides."""
|
||||
|
||||
|
||||
class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests):
    """Constructor tests; only the Series constructor test is adjusted."""

    def test_series_constructor(self, data):
        """Run the base test on a copy of ``data`` whose freq has been cleared."""
        # Series construction drops any .freq attr
        freqless = data._with_freq(None)
        super().test_series_constructor(freqless)
|
||||
|
||||
|
||||
class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests):
    """Inherits base.BaseGetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits base.BaseIndexTests without overrides."""
|
||||
|
||||
|
||||
class TestMethods(BaseDatetimeTests, base.BaseMethodsTests):
    """Method tests; ``test_combine_add`` is replaced by a no-op."""

    def test_combine_add(self, data_repeated):
        """Intentionally empty: overrides the inherited test so it does nothing."""
        # Timestamp.__add__(Timestamp) not defined
        return None
|
||||
|
||||
|
||||
class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
    """Inherits base.BaseInterfaceTests without overrides."""
|
||||
|
||||
|
||||
class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests):
    """
    Arithmetic tests for the datetime extension array.

    Operators listed in ``implements`` are checked with ``exc=None``; every
    other operator is delegated to the corresponding base-class test.
    """

    # operator names that are checked for a successful (non-raising) result
    implements = {"__sub__", "__rsub__"}

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """Check DataFrame-vs-scalar arithmetic for the implemented operators."""
        # frame & scalar
        if all_arithmetic_operators in self.implements:
            df = pd.DataFrame({"A": data})
            self.check_opname(df, all_arithmetic_operators, data[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Check Series-vs-scalar arithmetic for the implemented operators."""
        if all_arithmetic_operators in self.implements:
            ser = pd.Series(data)
            self.check_opname(ser, all_arithmetic_operators, ser.iloc[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_add_series_with_extension_array(self, data):
        """Adding a datetime Series and a DatetimeArray must raise TypeError."""
        # Datetime + Datetime not implemented
        ser = pd.Series(data)
        msg = "cannot add DatetimeArray and DatetimeArray"
        with pytest.raises(TypeError, match=msg):
            ser + data

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Check Series arithmetic; unimplemented operators use the base array test."""
        if all_arithmetic_operators in self.implements:
            ser = pd.Series(data)
            # NOTE(review): the second operand here is a scalar (ser.iloc[0]),
            # not an array -- confirm whether an array operand was intended.
            self.check_opname(ser, all_arithmetic_operators, ser.iloc[0], exc=None)
        else:
            # ... but not the rest.
            # Delegate to the *array* variant of the base test; this used to
            # call test_arith_series_with_scalar, which duplicated the scalar
            # test above and left the base array test unexercised.
            super().test_arith_series_with_array(data, all_arithmetic_operators)

    def test_divmod_series_array(self):
        """Intentionally empty override of the inherited divmod test."""
        # GH 23287
        # skipping because it is not implemented
        pass
|
||||
|
||||
|
||||
class TestCasting(BaseDatetimeTests, base.BaseCastingTests):
    """Inherits base.BaseCastingTests without overrides."""
|
||||
|
||||
|
||||
class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests):
    """Inherits base.BaseComparisonOpsTests without overrides."""
|
||||
|
||||
|
||||
class TestMissing(BaseDatetimeTests, base.BaseMissingTests):
    """Inherits base.BaseMissingTests without overrides."""
|
||||
|
||||
|
||||
class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests):
    """Inherits base.BaseReshapingTests without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests):
    """Inherits base.BaseSetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests):
    """Inherits base.BaseGroupbyTests without overrides."""
|
||||
|
||||
|
||||
class TestPrinting(BaseDatetimeTests, base.BasePrintingTests):
    """Inherits base.BasePrintingTests without overrides."""
|
||||
|
||||
|
||||
class Test2DCompat(BaseDatetimeTests, base.NDArrayBacked2DTests):
    """Inherits base.NDArrayBacked2DTests without overrides."""
|
||||
@@ -0,0 +1,26 @@
|
||||
"""
|
||||
Tests for behavior if an author does *not* implement EA methods.
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.arrays import ExtensionArray
|
||||
|
||||
|
||||
class MyEA(ExtensionArray):
    """Bare ExtensionArray subclass that only stores the values it is given."""

    def __init__(self, values):
        # keep the input as-is; nothing else of the EA interface is implemented here
        self._values = values
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """MyEA wrapping ``np.arange(10)``."""
    return MyEA(np.arange(10))
|
||||
|
||||
|
||||
class TestExtensionArray:
    """Behaviour of an ExtensionArray subclass that defines no operators."""

    def test_errors(self, data, all_arithmetic_operators):
        """Looking up any arithmetic dunder on the array raises AttributeError."""
        # invalid ops
        with pytest.raises(AttributeError):
            getattr(data, all_arithmetic_operators)
|
||||
@@ -0,0 +1,40 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.internals import BlockPlacement
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.internals import BlockManager
|
||||
from pandas.core.internals.blocks import ExtensionBlock
|
||||
|
||||
pytestmark = td.skip_array_manager_invalid_test
|
||||
|
||||
|
||||
class CustomBlock(ExtensionBlock):
    """ExtensionBlock subclass holding an ndarray and reporting that it cannot hold NA."""

    _holder = np.ndarray

    # Cannot override final attribute "_can_hold_na"
    @property  # type: ignore[misc]
    def _can_hold_na(self) -> bool:
        """Always False for this block."""
        return False
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """DataFrame with columns "a" and "b" where "b" is backed by a CustomBlock."""
    base_frame = pd.DataFrame({"a": [1, 2, 3]})

    # build the extra block that occupies manager position 1
    custom_block = CustomBlock(
        np.arange(3, dtype="int64"),
        placement=BlockPlacement(slice(1, 2)),
        ndim=2,
    )

    all_blocks = base_frame._mgr.blocks + (custom_block,)
    axes = [pd.Index(["a", "b"]), base_frame.index]
    return pd.DataFrame(BlockManager(all_blocks, axes))
|
||||
|
||||
|
||||
def test_concat_axis1(df):
    """Column-wise concat keeps the CustomBlock as the second manager block."""
    # GH17954
    other = pd.DataFrame({"c": [0.1, 0.2, 0.3]})
    combined = pd.concat([df, other], axis=1)
    second_block = combined._mgr.blocks[1]
    assert isinstance(second_block, CustomBlock)
|
||||
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import is_extension_array_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import is_float_dtype
|
||||
from pandas.core.arrays.floating import (
|
||||
Float32Dtype,
|
||||
Float64Dtype,
|
||||
)
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data():
    """Return a list of floats with pd.NA after each of the two arange runs, ending in 9.9, 10.0."""
    low_run = np.arange(0.1, 0.9, 0.1)
    high_run = np.arange(1, 9.8, 0.1)
    return [*low_run, pd.NA, *high_run, pd.NA, 9.9, 10.0]
|
||||
|
||||
|
||||
@pytest.fixture(params=[Float32Dtype, Float64Dtype])
def dtype(request):
    """Instance of each parametrized floating dtype class."""
    dtype_cls = request.param
    return dtype_cls()
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype):
    """pd.array of the ``make_data()`` values in the requested dtype."""
    values = make_data()
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos(dtype):
    """Array of one hundred 2s in the requested dtype."""
    twos = np.ones(100) * 2
    return pd.array(twos, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype):
    """Two-element array: pd.NA followed by 0.1."""
    values = [pd.NA, 0.1]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype):
    """Three values [0.1, 0.2, 0.0] in the requested dtype."""
    values = [0.1, 0.2, 0.0]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Three values [0.1, pd.NA, 0.0] in the requested dtype."""
    values = [0.1, pd.NA, 0.0]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Comparator that is true only when both arguments are pd.NA."""

    # we are pd.NA
    def _both_na(x, y):
        return x is pd.NA and y is pd.NA

    return _both_na
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value sentinel used by these tests: pd.NA."""
    missing = pd.NA
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype):
    """Array laid out as [B, B, NA, NA, A, A, B, C] with A=0.0, B=0.1, C=0.2."""
    a, b, c = 0.0, 0.1, 0.2
    na = pd.NA
    layout = [b, b, na, na, a, a, b, c]
    return pd.array(layout, dtype=dtype)
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
    """Inherits base.BaseDtypeTests without overrides."""
|
||||
|
||||
|
||||
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic tests for the floating arrays; operations are expected not to raise."""

    def check_opname(self, s, op_name, other, exc=None):
        """Forward to the base check, always with ``exc=None``."""
        # overwriting to indicate ops don't raise an error
        super().check_opname(s, op_name, other, exc=None)

    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        """Compare ``op(s, other)`` with the combined result cast to ``s``'s dtype."""
        if exc is not None:
            with pytest.raises(exc):
                op(s, other)
            return

        sdtype = tm.get_dtype(s)
        other_is_numpy_float = (
            hasattr(other, "dtype")
            and not is_extension_array_dtype(other.dtype)
            and is_float_dtype(other.dtype)
        )
        if other_is_numpy_float:
            # other is np.float64 and would therefore always result in
            # upcasting, so keeping other as same numpy_dtype
            other = other.astype(sdtype.numpy_dtype)

        result = op(s, other)

        # combine method result in 'biggest' (float64) dtype
        expected = self._combine(s, other, op).astype(sdtype)

        self.assert_equal(result, expected)

    def _check_divmod_op(self, s, op, other, exc=None):
        """Forward to the base divmod check, always passing ``None`` as exc."""
        super()._check_divmod_op(s, op, other, None)
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison tests for the floating arrays; results are compared as "boolean" dtype."""

    # TODO: share with IntegerArray?
    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        """Compare ``op(s, other)`` with ``s.combine(other, op)`` cast to "boolean"."""
        if exc is not None:
            with pytest.raises(exc):
                op(s, other)
            return

        result = op(s, other)
        # Override to do the astype to boolean
        combined = s.combine(other, op)
        expected = combined.astype("boolean")
        self.assert_series_equal(result, expected)

    def check_opname(self, s, op_name, other, exc=None):
        """Forward to the base check, always with ``exc=None``."""
        super().check_opname(s, op_name, other, exc=None)

    def _compare_other(self, s, data, op, other):
        """Derive the dunder name from ``op`` and run ``check_opname`` with it."""
        dunder = f"__{op.__name__}__"
        self.check_opname(s, dunder, other)
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
    """Inherits base.BaseInterfaceTests without overrides."""
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
    """Inherits base.BaseConstructorsTests without overrides."""
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
    """Inherits base.BaseReshapingTests without overrides."""
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
    """Inherits base.BaseGetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
    """Inherits base.BaseSetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits base.BaseIndexTests without overrides."""
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
    """Inherits base.BaseMissingTests without overrides."""
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests for the floating arrays with an adjusted value_counts check."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """value_counts on the first ten elements yields "Int64" counts indexed by the array dtype."""
        all_data = all_data[:10]
        # with dropna the reference data is the non-missing part as an ndarray
        other = np.array(all_data[~all_data.isna()]) if dropna else all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()

        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
        expected = expected.astype("Int64")
        expected.index = expected.index.astype(all_data.dtype)

        self.assert_series_equal(result, expected)

    @pytest.mark.xfail(reason="uses nullable integer")
    def test_value_counts_with_normalize(self, data):
        """Run the inherited test, which is marked as an expected failure here."""
        super().test_value_counts_with_normalize(data)
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
    """Inherits base.BaseCastingTests without overrides."""
|
||||
|
||||
|
||||
class TestGroupby(base.BaseGroupbyTests):
    """Inherits base.BaseGroupbyTests without overrides."""
|
||||
|
||||
|
||||
class TestNumericReduce(base.BaseNumericReduceTests):
    """Numeric reduction tests that expect pd.NA for unskipped missing values."""

    def check_reduce(self, s, op_name, skipna):
        """Compare the reduction of ``s`` with the same reduction on its NumPy-dtype values."""
        # overwrite to ensure pd.NA is tested instead of np.nan
        # https://github.com/pandas-dev/pandas/issues/30958
        result = getattr(s, op_name)(skipna=skipna)
        if not skipna and s.isna().any():
            expected = pd.NA
        else:
            dense = s.dropna().astype(s.dtype.numpy_dtype)
            reducer = getattr(dense, op_name)
            expected = reducer(skipna=skipna)
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Tested in tests/reductions/test_reductions.py")
class TestBooleanReduce(base.BaseBooleanReduceTests):
    """Inherits base.BaseBooleanReduceTests; the whole class is marked skip."""
|
||||
|
||||
|
||||
class TestPrinting(base.BasePrintingTests):
    """Inherits base.BasePrintingTests without overrides."""
|
||||
|
||||
|
||||
class TestParsing(base.BaseParsingTests):
    """Inherits base.BaseParsingTests without overrides."""
|
||||
|
||||
|
||||
class Test2DCompat(base.Dim2CompatTests):
    """Inherits base.Dim2CompatTests without overrides."""
|
||||
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.api.types import (
|
||||
is_extension_array_dtype,
|
||||
is_integer_dtype,
|
||||
)
|
||||
from pandas.core.arrays.integer import (
|
||||
Int8Dtype,
|
||||
Int16Dtype,
|
||||
Int32Dtype,
|
||||
Int64Dtype,
|
||||
UInt8Dtype,
|
||||
UInt16Dtype,
|
||||
UInt32Dtype,
|
||||
UInt64Dtype,
|
||||
)
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data():
    """Return 1..8, pd.NA, 10..97, pd.NA, 99, 100 as a single list."""
    return [*range(1, 9), pd.NA, *range(10, 98), pd.NA, 99, 100]
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        Int8Dtype,
        Int16Dtype,
        Int32Dtype,
        Int64Dtype,
        UInt8Dtype,
        UInt16Dtype,
        UInt32Dtype,
        UInt64Dtype,
    ]
)
def dtype(request):
    """Instance of each parametrized nullable-integer dtype class."""
    dtype_cls = request.param
    return dtype_cls()
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype):
    """pd.array of the ``make_data()`` values in the requested dtype."""
    values = make_data()
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos(dtype):
    """Array of one hundred 2s in the requested dtype."""
    twos = np.ones(100) * 2
    return pd.array(twos, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype):
    """Two-element array: pd.NA followed by 1."""
    values = [pd.NA, 1]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype):
    """Three values [1, 2, 0] in the requested dtype."""
    values = [1, 2, 0]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Three values [1, pd.NA, 0] in the requested dtype."""
    values = [1, pd.NA, 0]
    return pd.array(values, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Comparator that is true only when both arguments are pd.NA."""

    # we are pd.NA
    def _both_na(x, y):
        return x is pd.NA and y is pd.NA

    return _both_na
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value sentinel used by these tests: pd.NA."""
    missing = pd.NA
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype):
    """Array laid out as [B, B, NA, NA, A, A, B, C] with A=0, B=1, C=2."""
    a, b, c = 0, 1, 2
    na = pd.NA
    layout = [b, b, na, na, a, a, b, c]
    return pd.array(layout, dtype=dtype)
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
    """Inherits base.BaseDtypeTests without overrides."""
|
||||
|
||||
|
||||
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic tests for the nullable-integer arrays; operations are expected not to raise."""

    def check_opname(self, s, op_name, other, exc=None):
        """Forward to the base check, always with ``exc=None``."""
        # overwriting to indicate ops don't raise an error
        super().check_opname(s, op_name, other, exc=None)

    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        """Compare ``op(s, other)`` with the combined result cast to the expected dtype."""
        if exc is not None:
            with pytest.raises(exc):
                op(s, other)
            return

        sdtype = tm.get_dtype(s)

        recast_other = (
            hasattr(other, "dtype")
            and not is_extension_array_dtype(other.dtype)
            and is_integer_dtype(other.dtype)
            and sdtype.is_unsigned_integer
        )
        if recast_other:
            # TODO: comment below is inaccurate; other can be int8, int16, ...
            #  and the trouble is that e.g. if s is UInt8 and other is int8,
            #  then result is UInt16
            # other is np.int64 and would therefore always result in
            # upcasting, so keeping other as same numpy_dtype
            other = other.astype(sdtype.numpy_dtype)

        result = op(s, other)
        expected = self._combine(s, other, op)

        is_true_division = op_name in ("__rtruediv__", "__truediv__", "__div__")
        if is_true_division:
            expected = expected.fillna(np.nan).astype("Float64")
        else:
            # combine method result in 'biggest' (int64) dtype
            expected = expected.astype(sdtype)

        self.assert_equal(result, expected)

    def _check_divmod_op(self, s, op, other, exc=None):
        """Forward to the base divmod check, always passing ``None`` as exc."""
        super()._check_divmod_op(s, op, other, None)
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison tests for the nullable-integer arrays; results are compared as "boolean" dtype."""

    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        """Compare ``op(s, other)`` with ``s.combine(other, op)`` cast to "boolean"."""
        if exc is not None:
            with pytest.raises(exc):
                op(s, other)
            return

        result = op(s, other)
        # Override to do the astype to boolean
        combined = s.combine(other, op)
        expected = combined.astype("boolean")
        self.assert_series_equal(result, expected)

    def check_opname(self, s, op_name, other, exc=None):
        """Forward to the base check, always with ``exc=None``."""
        super().check_opname(s, op_name, other, exc=None)

    def _compare_other(self, s, data, op, other):
        """Derive the dunder name from ``op`` and run ``check_opname`` with it."""
        dunder = f"__{op.__name__}__"
        self.check_opname(s, dunder, other)
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
    """Inherits base.BaseInterfaceTests without overrides."""
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
    """Inherits base.BaseConstructorsTests without overrides."""
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
    """Inherits base.BaseReshapingTests without overrides."""
|
||||
|
||||
# for test_concat_mixed_dtypes test
|
||||
# concat of an Integer and Int coerces to object dtype
|
||||
# TODO(jreback) once integrated this would
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
    """Inherits base.BaseGetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
    """Inherits base.BaseSetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits base.BaseIndexTests without overrides."""
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
    """Inherits base.BaseMissingTests without overrides."""
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests for the nullable-integer arrays with an adjusted value_counts check."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """value_counts on the first ten elements yields "Int64" counts indexed by the array dtype."""
        all_data = all_data[:10]
        # with dropna the reference data is the non-missing part as an ndarray
        other = np.array(all_data[~all_data.isna()]) if dropna else all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()

        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
        expected = expected.astype("Int64")
        expected.index = expected.index.astype(all_data.dtype)

        self.assert_series_equal(result, expected)

    @pytest.mark.xfail(reason="uses nullable integer")
    def test_value_counts_with_normalize(self, data):
        """Run the inherited test, which is marked as an expected failure here."""
        super().test_value_counts_with_normalize(data)
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
    """Inherits base.BaseCastingTests without overrides."""
|
||||
|
||||
|
||||
class TestGroupby(base.BaseGroupbyTests):
    """Inherits base.BaseGroupbyTests without overrides."""
|
||||
|
||||
|
||||
class TestNumericReduce(base.BaseNumericReduceTests):
    """Numeric reduction tests that expect pd.NA for unskipped missing values."""

    def check_reduce(self, s, op_name, skipna):
        """Compare the reduction of ``s`` with the same reduction on its values as "int64"."""
        # overwrite to ensure pd.NA is tested instead of np.nan
        # https://github.com/pandas-dev/pandas/issues/30958
        result = getattr(s, op_name)(skipna=skipna)
        if not skipna and s.isna().any():
            expected = pd.NA
        else:
            dense = s.dropna().astype("int64")
            reducer = getattr(dense, op_name)
            expected = reducer(skipna=skipna)
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Tested in tests/reductions/test_reductions.py")
class TestBooleanReduce(base.BaseBooleanReduceTests):
    """Inherits base.BaseBooleanReduceTests; the whole class is marked skip."""
|
||||
|
||||
|
||||
class TestPrinting(base.BasePrintingTests):
    """Inherits base.BasePrintingTests without overrides."""
|
||||
|
||||
|
||||
class TestParsing(base.BaseParsingTests):
    """Inherits base.BaseParsingTests without overrides."""
|
||||
|
||||
|
||||
class Test2DCompat(base.Dim2CompatTests):
    """Inherits base.Dim2CompatTests without overrides."""
|
||||
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import IntervalDtype
|
||||
|
||||
from pandas import (
|
||||
Interval,
|
||||
Series,
|
||||
)
|
||||
from pandas.core.arrays import IntervalArray
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data():
    """Return 100 random Intervals whose left edges are a cumulative sum of uniform draws."""
    count = 100
    # left edges first, then widths: the order of the two random draws matters
    lefts = np.random.uniform(size=count).cumsum()
    rights = lefts + np.random.uniform(size=count)
    return list(map(Interval, lefts, rights))
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Default-constructed IntervalDtype."""
    interval_dtype = IntervalDtype()
    return interval_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Length-100 IntervalArray for semantics test."""
    intervals = make_data()
    return IntervalArray(intervals)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    pairs = [None, (0, 1)]
    return IntervalArray.from_tuples(pairs)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting():
    """IntervalArray built from the tuples (1, 2), (2, 3), (0, 1)."""
    pairs = [(1, 2), (2, 3), (0, 1)]
    return IntervalArray.from_tuples(pairs)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting():
    """IntervalArray built from (1, 2), a missing entry, and (0, 1)."""
    pairs = [(1, 2), None, (0, 1)]
    return IntervalArray.from_tuples(pairs)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value sentinel used by these tests: np.nan."""
    missing = np.nan
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping():
    """IntervalArray laid out as [B, B, NA, NA, A, A, B, C] with A=(0, 1), B=(1, 2), C=(2, 3)."""
    a, b, c = (0, 1), (1, 2), (2, 3)
    layout = [b, b, None, None, a, a, b, c]
    return IntervalArray.from_tuples(layout)
|
||||
|
||||
|
||||
class BaseInterval:
    """Empty mixin shared by the interval test classes in this module."""
|
||||
|
||||
|
||||
class TestDtype(BaseInterval, base.BaseDtypeTests):
    """Inherits base.BaseDtypeTests without overrides."""
|
||||
|
||||
|
||||
class TestCasting(BaseInterval, base.BaseCastingTests):
    """Inherits base.BaseCastingTests without overrides."""
|
||||
|
||||
|
||||
class TestConstructors(BaseInterval, base.BaseConstructorsTests):
    """Inherits base.BaseConstructorsTests without overrides."""
|
||||
|
||||
|
||||
class TestGetitem(BaseInterval, base.BaseGetitemTests):
    """Inherits base.BaseGetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits base.BaseIndexTests without overrides."""
|
||||
|
||||
|
||||
class TestGrouping(BaseInterval, base.BaseGroupbyTests):
    """Inherits base.BaseGroupbyTests without overrides."""
|
||||
|
||||
|
||||
class TestInterface(BaseInterval, base.BaseInterfaceTests):
    """Inherits base.BaseInterfaceTests without overrides."""
|
||||
|
||||
|
||||
class TestReduce(base.BaseNoReduceTests):
    """Reduction tests: min/max are checked directly, the rest use the base test."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """For "min"/"max" assert the result is an element of ``data``; otherwise defer to the base test."""
        op_name = all_numeric_reductions
        ser = Series(data)

        if op_name not in ["min", "max"]:
            super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
            return

        # IntervalArray *does* implement these
        for container in (ser, data):
            assert getattr(container, op_name)(skipna=skipna) in data
|
||||
|
||||
|
||||
class TestMethods(BaseInterval, base.BaseMethodsTests):
    """Method tests for IntervalArray with two inherited tests marked xfail."""

    @pytest.mark.xfail(reason="addition is not defined for intervals")
    def test_combine_add(self, data_repeated):
        """Run the inherited test under an xfail marker."""
        super().test_combine_add(data_repeated)

    @pytest.mark.xfail(
        reason="Raises with incorrect message bc it disallows *all* listlikes "
        "instead of just wrong-length listlikes"
    )
    def test_fillna_length_mismatch(self, data_missing):
        """Run the inherited test under an xfail marker."""
        super().test_fillna_length_mismatch(data_missing)
|
||||
|
||||
|
||||
class TestMissing(BaseInterval, base.BaseMissingTests):
    """Missing-data tests for IntervalArray; non-scalar fill tests are marked xfail."""

    # Index.fillna only accepts scalar `value`, so we have to xfail all
    # non-scalar fill tests.
    # NOTE(review): the overrides below take no fixtures and call super() with
    # no arguments. If the base methods require fixture arguments, the xfail is
    # satisfied by the resulting TypeError rather than by fillna behaviour --
    # confirm against the base class before relying on these markers.
    unsupported_fill = pytest.mark.xfail(
        reason="Unsupported fillna option for Interval."
    )

    @unsupported_fill
    def test_fillna_limit_pad(self):
        """Inherited test, expected to fail."""
        super().test_fillna_limit_pad()

    @unsupported_fill
    def test_fillna_series_method(self):
        """Inherited test, expected to fail."""
        super().test_fillna_series_method()

    @unsupported_fill
    def test_fillna_limit_backfill(self):
        """Inherited test, expected to fail."""
        super().test_fillna_limit_backfill()

    @unsupported_fill
    def test_fillna_no_op_returns_copy(self):
        """Inherited test, expected to fail."""
        super().test_fillna_no_op_returns_copy()

    @unsupported_fill
    def test_fillna_series(self):
        """Inherited test, expected to fail."""
        super().test_fillna_series()

    def test_fillna_non_scalar_raises(self, data_missing):
        """Filling with a list raises TypeError with the IntervalArray insert message."""
        expected_message = "can only insert Interval objects and NA into an IntervalArray"
        with pytest.raises(TypeError, match=expected_message):
            data_missing.fillna([1, 1])
|
||||
|
||||
|
||||
class TestReshaping(BaseInterval, base.BaseReshapingTests):
    """Inherits base.BaseReshapingTests without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(BaseInterval, base.BaseSetitemTests):
    """Inherits base.BaseSetitemTests without overrides."""
|
||||
|
||||
|
||||
class TestPrinting(BaseInterval, base.BasePrintingTests):
    """Printing tests for IntervalArray; the array repr test is marked xfail."""

    @pytest.mark.xfail(reason="Interval has custom repr")
    def test_array_repr(self, data, size):
        """Inherited repr test, expected to fail."""
        # NOTE(review): `data` and `size` are accepted but not forwarded to
        # super(); if the base method requires them, the xfail is met by a
        # TypeError rather than by a repr mismatch -- confirm intent.
        super().test_array_repr()
|
||||
|
||||
|
||||
class TestParsing(BaseInterval, base.BaseParsingTests):
    """Parsing tests: the inherited EA parsing test is expected to raise."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """The inherited test must raise NotImplementedError about _from_sequence_of_strings."""
        pattern = r".*must implement _from_sequence_of_strings.*"
        with pytest.raises(NotImplementedError, match=pattern):
            super().test_EA_types(engine, data)
|
||||
@@ -0,0 +1,463 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
Note: we do not bother with base.BaseIndexTests because PandasArray
|
||||
will never be held in an Index.
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.cast import can_hold_element
|
||||
from pandas.core.dtypes.dtypes import (
|
||||
ExtensionDtype,
|
||||
PandasDtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays.numpy_ import PandasArray
|
||||
from pandas.core.internals import blocks
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def _can_hold_element_patched(obj, element) -> bool:
    """Call ``can_hold_element`` after converting a PandasArray element to a NumPy array."""
    candidate = element.to_numpy() if isinstance(element, PandasArray) else element
    return can_hold_element(obj, candidate)
|
||||
|
||||
|
||||
# Keep a reference to the unpatched helper; _assert_attr_equal delegates to it.
orig_assert_attr_equal = tm.assert_attr_equal
|
||||
|
||||
|
||||
def _assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
    """
    patch tm.assert_attr_equal so PandasDtype("object") is close enough to
    np.dtype("object")
    """
    if attr == "dtype":
        left_dtype = getattr(left, "dtype", None)
        right_dtype = getattr(right, "dtype", None)
        left_wrapped = isinstance(left_dtype, PandasDtype)
        right_wrapped = isinstance(right_dtype, PandasDtype)

        # when exactly one side carries a PandasDtype, cast that side to the
        # underlying NumPy dtype before comparing
        if left_wrapped and not right_wrapped:
            left = left.astype(left_dtype.numpy_dtype)
        elif right_wrapped and not left_wrapped:
            right = right.astype(right_dtype.numpy_dtype)

    orig_assert_attr_equal(attr, left, right, obj)
|
||||
|
||||
|
||||
@pytest.fixture(params=["float", "object"])
def dtype(request):
    """PandasDtype wrapping the NumPy dtype named by each parameter."""
    numpy_dtype = np.dtype(request.param)
    return PandasDtype(numpy_dtype)
|
||||
|
||||
|
||||
@pytest.fixture
def allow_in_pandas(monkeypatch):
    """
    A monkeypatch that tells pandas to let us in.

    By default, passing a PandasArray to an index / series / frame
    constructor will unbox that PandasArray to an ndarray, and treat
    it as a non-EA column. We don't want people using EAs without
    reason.

    The mechanism for this is a check against ABCPandasArray
    in each constructor.

    But, for testing, we need to allow them in pandas. So we patch
    the _typ of PandasArray, so that we evade the ABCPandasArray
    check.
    """
    patches = (
        (PandasArray, "_typ", "extension"),
        (blocks, "can_hold_element", _can_hold_element_patched),
        (tm.asserters, "assert_attr_equal", _assert_attr_equal),
    )
    with monkeypatch.context() as patcher:
        for target, attribute, replacement in patches:
            patcher.setattr(target, attribute, replacement)
        yield
|
||||
|
||||
|
||||
@pytest.fixture
def data(allow_in_pandas, dtype):
    """100-element array: one-element tuples for object dtype, else the values 1..100."""
    if dtype.numpy_dtype == "object":
        nested = pd.Series([(i,) for i in range(100)])
        return nested.array
    values = np.arange(1, 101, dtype=dtype._dtype)
    return PandasArray(values)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(allow_in_pandas, dtype):
    """Two-element PandasArray: NaN followed by a valid value ((1,) or 1.0)."""
    if dtype.numpy_dtype == "object":
        values = np.array([np.nan, (1,)], dtype=object)
    else:
        values = np.array([np.nan, 1.0])
    return PandasArray(values)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Missing-value sentinel used by these tests: np.nan."""
    missing = np.nan
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Comparator that is truthy when ``np.isnan`` holds for both arguments."""

    def _both_nan(a, b):
        return np.isnan(a) and np.isnan(b)

    return _both_nan
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(allow_in_pandas, dtype):
    """Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C
    """
    if dtype.numpy_dtype != "object":
        return PandasArray(np.array([1, 2, 0]))

    # Use an empty tuple for first element, then remove,
    # to disable np.array's shape inference.
    padded = np.array([(), (2,), (3,), (1,)], dtype=object)
    return PandasArray(padded[1:])
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(allow_in_pandas, dtype):
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    if dtype.numpy_dtype == "object":
        values = np.array([(1,), np.nan, (0,)], dtype=object)
    else:
        values = np.array([1, np.nan, 0])
    return PandasArray(values)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(allow_in_pandas, dtype):
    """Data for factorization, grouping, and unique tests.

    The layout is [B, B, NA, NA, A, A, B, C], where A < B < C and NA is
    missing. ``allow_in_pandas`` is not used in the body.
    """
    numpy_dtype = dtype.numpy_dtype
    # Object dtype uses one-element tuples, everything else uses 0, 1, 2.
    a, b, c = ((1,), (2,), (3,)) if numpy_dtype == "object" else np.arange(3)
    layout = [b, b, np.nan, np.nan, a, a, b, c]
    return PandasArray(np.array(layout, dtype=dtype.numpy_dtype))
|
||||
|
||||
|
||||
@pytest.fixture
def skip_numpy_object(dtype, request):
    """Mark the requesting test as an expected failure for object dtype.

    Tests for PandasArray with nested data. Users typically won't create
    these objects via `pd.array`, but they can show up through `.array`
    on a Series with nested data. Many of the base tests fail, as they aren't
    appropriate for nested data.

    Despite the name, nothing is skipped: when ``dtype == "object"`` an
    ``xfail`` marker is added to the requesting test node. It is meant to be
    applied through a ``usefixtures`` marker on an individual test or a test
    class.
    """
    if dtype == "object":
        request.node.add_marker(pytest.mark.xfail(reason="Fails for object dtype"))
|
||||
|
||||
|
||||
skip_nested = pytest.mark.usefixtures("skip_numpy_object")
|
||||
|
||||
|
||||
class BaseNumPyTests:
    """Mixin that adapts the base-suite series assertion to PandasDtype."""

    @classmethod
    def assert_series_equal(cls, left, right, *args, **kwargs):
        """Compare two Series, casting ``right`` to PandasDtype when needed.

        The base class tests hard-code expected values with numpy dtypes,
        whereas we generally want the corresponding PandasDtype. The cast is
        applied only when ``right`` is a Series with a non-extension dtype and
        ``left`` has a PandasDtype.
        """
        needs_cast = (
            isinstance(right, pd.Series)
            and not isinstance(right.dtype, ExtensionDtype)
            and isinstance(left.dtype, PandasDtype)
        )
        if needs_cast:
            target = PandasDtype(right.dtype)
            right = right.astype(target)
        return tm.assert_series_equal(left, right, *args, **kwargs)
|
||||
|
||||
|
||||
class TestCasting(BaseNumPyTests, base.BaseCastingTests):
    """Casting tests from the base suite, run against PandasArray."""

    @skip_nested
    def test_astype_str(self, data):
        """Run the base test; marked xfail for object dtype by ``skip_nested``.

        Failure seen with nested data:
        ValueError: setting an array element with a sequence
        """
        super().test_astype_str(data)
|
||||
|
||||
|
||||
class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
    """Constructor tests from the base suite, run against PandasArray."""

    # We don't want to register. This test should probably be split in two.
    @pytest.mark.skip(reason="We don't register our dtype")
    def test_from_dtype(self, data):
        """Skipped unconditionally; the body is intentionally empty."""
        pass

    @skip_nested
    def test_series_constructor_scalar_with_index(self, data, dtype):
        """Run the base test; marked xfail for object dtype by ``skip_nested``.

        Failure seen with nested data:
        ValueError: Length of passed values is 1, index implies 3.
        """
        super().test_series_constructor_scalar_with_index(data, dtype)
|
||||
|
||||
|
||||
class TestDtype(BaseNumPyTests, base.BaseDtypeTests):
    """Dtype tests from the base suite, run against PandasDtype."""

    def test_check_dtype(self, data, request):
        """Run the base test, expecting failure for object dtype."""
        if data.dtype.numpy_dtype == "object":
            reason = (
                f"PandasArray expectedly clashes with a "
                f"NumPy name: {data.dtype.numpy_dtype}"
            )
            request.node.add_marker(pytest.mark.xfail(reason=reason))
        super().test_check_dtype(data)
|
||||
|
||||
|
||||
class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
    """Getitem tests from the base suite, run against PandasArray."""

    @skip_nested
    def test_getitem_scalar(self, data):
        """Run the base test; marked xfail for object dtype by ``skip_nested``.

        Failure seen with nested data: AssertionError
        """
        super().test_getitem_scalar(data)
|
||||
|
||||
|
||||
class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
    """Groupby tests from the base suite, run against PandasArray."""

    def test_groupby_extension_apply(
        self, data_for_grouping, groupby_apply_op, request
    ):
        """Run the base test, expecting failure for Series-returning ops on object dtype.

        The op is probed once with ``[None]`` to see whether it produces a
        Series.
        """
        probe = groupby_apply_op([None])
        returns_series = isinstance(probe, pd.Series)
        if returns_series and data_for_grouping.dtype.numpy_dtype == object:
            request.node.add_marker(
                pytest.mark.xfail(reason="raises in MultiIndex construction")
            )
        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
|
||||
|
||||
|
||||
class TestInterface(BaseNumPyTests, base.BaseInterfaceTests):
    """Interface tests from the base suite, run against PandasArray."""

    @skip_nested
    def test_array_interface(self, data):
        """Run the base test; marked xfail for object dtype by ``skip_nested``.

        Cause noted for nested data: NumPy array shape inference.
        """
        super().test_array_interface(data)
|
||||
|
||||
|
||||
class TestMethods(BaseNumPyTests, base.BaseMethodsTests):
    """Method tests from the base suite, run against PandasArray.

    Tests decorated with ``skip_nested`` are marked xfail for object dtype.
    """

    @skip_nested
    def test_shift_fill_value(self, data):
        """np.array shape inference. Shift implementation fails."""
        super().test_shift_fill_value(data)

    @skip_nested
    def test_fillna_copy_frame(self, data_missing):
        """The "scalar" for this array isn't a scalar."""
        super().test_fillna_copy_frame(data_missing)

    @skip_nested
    def test_fillna_copy_series(self, data_missing):
        """The "scalar" for this array isn't a scalar."""
        super().test_fillna_copy_series(data_missing)

    @skip_nested
    def test_searchsorted(self, data_for_sorting, as_series):
        """Test setup fails for nested data."""
        super().test_searchsorted(data_for_sorting, as_series)

    @pytest.mark.xfail(reason="PandasArray.diff may fail on dtype")
    def test_diff(self, data, periods):
        """Run the base diff test; marked xfail for every dtype."""
        return super().test_diff(data, periods)

    def test_insert(self, data, request):
        """Run the base insert test, expecting failure for object dtype."""
        if data.dtype.numpy_dtype == object:
            request.node.add_marker(
                pytest.mark.xfail(reason="Dimension mismatch in np.concatenate")
            )

        super().test_insert(data)

    @skip_nested
    def test_insert_invalid(self, data, invalid_scalar):
        """PandasArray[object] can hold anything, so skip."""
        super().test_insert_invalid(data, invalid_scalar)
|
||||
|
||||
|
||||
class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
    """Arithmetic tests from the base suite, run against PandasArray.

    All four expected-exception attributes are ``None``. Tests decorated with
    ``skip_nested`` are marked xfail for object dtype.
    """

    divmod_exc = None
    series_scalar_exc = None
    frame_scalar_exc = None
    series_array_exc = None

    @skip_nested
    def test_divmod(self, data):
        """Run the base divmod test."""
        super().test_divmod(data)

    @skip_nested
    def test_divmod_series_array(self, data):
        """Check ``divmod(Series(data), data)`` with no exception expected."""
        wrapped = pd.Series(data)
        self._check_divmod_op(wrapped, divmod, data, exc=None)

    @skip_nested
    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Run the base Series-with-scalar test."""
        super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
        """Run the base Series-with-array test.

        For object dtype every operator other than ``__add__``/``__radd__``
        is expected to fail.
        """
        is_object = data.dtype.numpy_dtype == object
        if is_object and all_arithmetic_operators not in ("__add__", "__radd__"):
            request.node.add_marker(pytest.mark.xfail(reason="Fails for object dtype"))
        super().test_arith_series_with_array(data, all_arithmetic_operators)

    @skip_nested
    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """Run the base DataFrame-with-scalar test."""
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
|
||||
|
||||
|
||||
class TestPrinting(BaseNumPyTests, base.BasePrintingTests):
    """Printing tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests):
    """Reduction tests from the base suite, run against PandasArray."""

    def check_reduce(self, s, op_name, skipna):
        """Compare a reduction on ``s`` with the same reduction on its NumPy-typed cast."""
        result = getattr(s, op_name)(skipna=skipna)
        # avoid coercing int -> float. Just cast to the actual numpy type.
        as_numpy_typed = s.astype(s.dtype._dtype)
        expected = getattr(as_numpy_typed, op_name)(skipna=skipna)
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        """Run the base reduction test with the ``all_boolean_reductions`` fixture."""
        super().test_reduce_series(data, all_boolean_reductions, skipna)
|
||||
|
||||
|
||||
@skip_nested
class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests):
    """Boolean reduction tests inherited without overrides.

    ``skip_nested`` is applied to the whole class, so every test is marked
    xfail for object dtype.
    """
|
||||
|
||||
|
||||
class TestMissing(BaseNumPyTests, base.BaseMissingTests):
    """Missing-data tests from the base suite, run against PandasArray.

    Tests decorated with ``skip_nested`` are marked xfail for object dtype.
    """

    @skip_nested
    def test_fillna_series(self, data_missing):
        """Non-scalar "scalar" values."""
        super().test_fillna_series(data_missing)

    @skip_nested
    def test_fillna_frame(self, data_missing):
        """Non-scalar "scalar" values."""
        super().test_fillna_frame(data_missing)
|
||||
|
||||
|
||||
class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
    """Reshaping tests from the base suite, run against PandasArray."""

    # The ``in_frame=False`` case is an expected failure.
    _IN_FRAME_CASES = [
        True,
        pytest.param(
            False,
            marks=pytest.mark.xfail(reason="PandasArray inconsistently extracted"),
        ),
    ]

    @pytest.mark.parametrize("in_frame", _IN_FRAME_CASES)
    def test_concat(self, data, in_frame):
        """Run the base concat test for both ``in_frame`` values."""
        super().test_concat(data, in_frame)
|
||||
|
||||
|
||||
class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
    """Setitem tests from the base suite, run against PandasArray.

    Tests decorated with ``skip_nested`` are marked xfail for object dtype.
    """

    @skip_nested
    def test_setitem_invalid(self, data, invalid_scalar):
        """object dtype can hold anything, so doesn't raise."""
        super().test_setitem_invalid(data, invalid_scalar)

    @skip_nested
    def test_setitem_sequence_broadcasts(self, data, box_in_series):
        """Run the base test.

        Failure seen with nested data:
        ValueError: cannot set using a list-like indexer with a different
        length than the value
        """
        super().test_setitem_sequence_broadcasts(data, box_in_series)

    @skip_nested
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        """Run the base test.

        Failure seen with nested data:
        ValueError: cannot set using a list-like indexer with a different
        length than the value
        """
        super().test_setitem_mask_broadcast(data, setter)

    @skip_nested
    def test_setitem_scalar_key_sequence_raise(self, data):
        """Run the base test.

        Failure seen with nested data:
        Failed: DID NOT RAISE <class 'ValueError'>
        """
        super().test_setitem_scalar_key_sequence_raise(data)

    # TODO: there is some issue with PandasArray, therefore,
    #   skip the setitem test for now, and fix it later (GH 31446)

    @skip_nested
    @pytest.mark.parametrize(
        "mask",
        [
            np.array([True, True, True, False, False]),
            pd.array([True, True, True, False, False], dtype="boolean"),
        ],
        ids=["numpy-array", "boolean-array"],
    )
    def test_setitem_mask(self, data, mask, box_in_series):
        """Run the base mask test with a NumPy mask and a ``boolean`` mask."""
        super().test_setitem_mask(data, mask, box_in_series)

    def test_setitem_mask_raises(self, data, box_in_series):
        """Run the base test unchanged."""
        super().test_setitem_mask_raises(data, box_in_series)

    @skip_nested
    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_setitem_integer_array(self, data, idx, box_in_series):
        """Run the base test with list, ``Int64`` and NumPy integer indexers."""
        super().test_setitem_integer_array(data, idx, box_in_series)

    # NOTE(review): the fourth case passes ``box_in_series=False`` although its
    # id is "integer-array-True", so it duplicates the third case. Presumably
    # ``True`` was intended; confirm before changing, since that may expose a
    # failing test.
    @pytest.mark.parametrize(
        "idx, box_in_series",
        [
            ([0, 1, 2, pd.NA], False),
            pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
        ],
        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
    )
    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
        """Run the base test with indexers that contain ``pd.NA``."""
        super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)

    @skip_nested
    def test_setitem_slice(self, data, box_in_series):
        """Run the base slice test."""
        super().test_setitem_slice(data, box_in_series)

    @skip_nested
    def test_setitem_loc_iloc_slice(self, data):
        """Run the base loc/iloc slice test."""
        super().test_setitem_loc_iloc_slice(data)

    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
        """Set a new column through ``.loc`` and compare with the expected frame.

        https://github.com/pandas-dev/pandas/issues/32395
        """
        source = pd.DataFrame({"data": pd.Series(data)})
        expected = source
        result = pd.DataFrame(index=source.index)

        # because result has object dtype, the attempt to do setting inplace
        #  is successful, and object dtype is retained
        key = full_indexer(source)
        result.loc[key, "data"] = source["data"]

        # base class method has expected = df; PandasArray behaves oddly because
        #  we patch _typ for these tests.
        if data.dtype.numpy_dtype != object and (
            not isinstance(key, slice) or key != slice(None)
        ):
            expected = pd.DataFrame({"data": data.to_numpy()})
        self.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_nested
class TestParsing(BaseNumPyTests, base.BaseParsingTests):
    """Parsing tests inherited without overrides.

    ``skip_nested`` is applied to the whole class, so every test is marked
    xfail for object dtype.
    """
|
||||
|
||||
|
||||
class Test2DCompat(BaseNumPyTests, base.NDArrayBacked2DTests):
    """2D compatibility tests inherited from the base suite without overrides."""
|
||||
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import iNaT
|
||||
|
||||
from pandas.core.dtypes.dtypes import PeriodDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.arrays import PeriodArray
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Return a ``PeriodDtype`` with frequency ``"D"``."""
    return PeriodDtype(freq="D")
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype):
    """Length-100 PeriodArray built from the integers 1970..2069."""
    ordinals = np.arange(1970, 2070)
    return PeriodArray(ordinals, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos(dtype):
    """Length-100 PeriodArray built from an array in which every value is 2."""
    twos = np.ones(100) * 2
    return PeriodArray(twos, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype):
    """Length-3 PeriodArray built from ``[2018, 2019, 2017]``."""
    values = [2018, 2019, 2017]
    return PeriodArray(values, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype):
    """Length-2 PeriodArray built from ``[iNaT, 2017]``."""
    values = [iNaT, 2017]
    return PeriodArray(values, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Length-3 PeriodArray built from ``[2018, iNaT, 2017]``."""
    values = [2018, iNaT, 2017]
    return PeriodArray(values, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype):
    """PeriodArray laid out as [B, B, NA, NA, A, A, B, C].

    A, B and C are 2017, 2018 and 2019; NA is ``iNaT``.
    """
    A, B, C = 2017, 2018, 2019
    NA = iNaT
    layout = [B, B, NA, NA, A, A, B, C]
    return PeriodArray(layout, freq=dtype.freq)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Return ``pd.NaT``, the missing-value scalar for this module's tests."""
    return pd.NaT
|
||||
|
||||
|
||||
class BasePeriodTests:
    """Empty mixin shared by the Period test classes in this module."""
|
||||
|
||||
|
||||
class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests):
    """Dtype tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestConstructors(BasePeriodTests, base.BaseConstructorsTests):
    """Constructor tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestGetitem(BasePeriodTests, base.BaseGetitemTests):
    """Getitem tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Index tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestMethods(BasePeriodTests, base.BaseMethodsTests):
    """Method tests from the base suite, run against PeriodArray."""

    def test_combine_add(self, data_repeated):
        """Overridden with an empty body: Period + Period is not defined."""
        pass
|
||||
|
||||
|
||||
class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
    """Interface tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests):
    """Arithmetic tests from the base suite, run against PeriodArray.

    Only the operators listed in ``implements`` are checked for a successful
    result; every other operator is delegated to the matching base test.
    """

    # Operators that are checked with ``exc=None`` (no exception expected).
    implements = {"__sub__", "__rsub__"}

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """Check DataFrame-with-scalar arithmetic."""
        # frame & scalar
        if all_arithmetic_operators in self.implements:
            df = pd.DataFrame({"A": data})
            self.check_opname(df, all_arithmetic_operators, data[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Check Series-with-scalar arithmetic."""
        # we implement subtraction...
        if all_arithmetic_operators in self.implements:
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Check Series-with-array arithmetic.

        The implemented branch still uses the scalar ``s.iloc[0]`` as the
        other operand, as before.
        """
        if all_arithmetic_operators in self.implements:
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest. Delegate to the *array* variant of the
            # base test; this previously called the scalar variant, so the
            # inherited Series-with-array check never ran.
            super().test_arith_series_with_array(data, all_arithmetic_operators)

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        """Run the base divmod check, always expecting ``TypeError``.

        The ``exc`` argument is accepted for signature compatibility and
        ignored.
        """
        super()._check_divmod_op(s, op, other, exc=TypeError)

    def test_add_series_with_extension_array(self, data):
        """Adding a PeriodArray to a Series of periods raises ``TypeError``."""
        # we don't implement + for Period
        s = pd.Series(data)
        msg = (
            r"unsupported operand type\(s\) for \+: "
            r"\'PeriodArray\' and \'PeriodArray\'"
        )
        with pytest.raises(TypeError, match=msg):
            s + data

    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
    def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
        """``PeriodArray.__sub__`` returns ``NotImplemented`` for Series/DataFrame."""
        # Override to use __sub__ instead of __add__
        other = pd.Series(data)
        if box is pd.DataFrame:
            other = other.to_frame()

        result = data.__sub__(other)
        assert result is NotImplemented
|
||||
|
||||
|
||||
class TestCasting(BasePeriodTests, base.BaseCastingTests):
    """Casting tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests):
    """Comparison tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestMissing(BasePeriodTests, base.BaseMissingTests):
    """Missing-data tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestReshaping(BasePeriodTests, base.BaseReshapingTests):
    """Reshaping tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(BasePeriodTests, base.BaseSetitemTests):
    """Setitem tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):
    """Groupby tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestPrinting(BasePeriodTests, base.BasePrintingTests):
    """Printing tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestParsing(BasePeriodTests, base.BaseParsingTests):
    """Parsing tests from the base suite, run against PeriodArray."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Run the base test once per parser engine (``"c"`` and ``"python"``)."""
        super().test_EA_types(engine, data)
|
||||
|
||||
|
||||
class Test2DCompat(BasePeriodTests, base.NDArrayBacked2DTests):
    """2D compatibility tests inherited from the base suite without overrides."""
|
||||
@@ -0,0 +1,513 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import np_version_under1p20
|
||||
from pandas.errors import PerformanceWarning
|
||||
|
||||
from pandas.core.dtypes.common import is_object_dtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import SparseDtype
|
||||
import pandas._testing as tm
|
||||
from pandas.arrays import SparseArray
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def make_data(fill_value):
    """Return a length-100 random array with every third slot set to ``fill_value``.

    A NaN ``fill_value`` yields uniform floats; any other value yields random
    integers in ``[1, 100)`` whose first two entries are forced to differ.
    Positions 2, 5, 8, ... are then overwritten with ``fill_value``.
    """
    if np.isnan(fill_value):
        values = np.random.uniform(size=100)
    else:
        values = np.random.randint(1, 100, size=100)
        # Bump the first element when it collides with the second.
        if values[0] == values[1]:
            values[0] += 1

    values[2::3] = fill_value
    return values
|
||||
|
||||
|
||||
@pytest.fixture
def dtype():
    """Return a default-constructed ``SparseDtype``."""
    return SparseDtype()
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data(request):
    """Length-100 SparseArray for semantics test.

    Parametrized over the fill values ``0`` and ``np.nan``.
    """
    fill = request.param
    return SparseArray(make_data(fill), fill_value=fill)
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_twos(request):
    """Length-100 SparseArray in which every value is 2.0.

    ``request`` is accepted but not used.
    """
    twos = np.ones(100) * 2
    return SparseArray(twos)
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_missing(request):
    """Length 2 array with [NA, Valid], for fill values ``0`` and ``np.nan``."""
    values = [np.nan, 1]
    return SparseArray(values, fill_value=request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_repeated(request):
    """Yield a generator function producing ``count`` freshly made SparseArrays.

    Each array comes from a new ``make_data`` call using the parametrized
    fill value (``0`` or ``np.nan``).
    """
    fill = request.param

    def gen(count):
        produced = 0
        while produced < count:
            yield SparseArray(make_data(fill), fill_value=fill)
            produced += 1

    yield gen
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_for_sorting(request):
    """SparseArray of ``[2, 3, 1]`` for fill values ``0`` and ``np.nan``."""
    values = [2, 3, 1]
    return SparseArray(values, fill_value=request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_missing_for_sorting(request):
    """SparseArray of ``[2, NaN, 1]`` for fill values ``0`` and ``np.nan``."""
    values = [2, np.nan, 1]
    return SparseArray(values, fill_value=request.param)
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Return ``np.nan``, the missing-value scalar for this module's tests."""
    return np.nan
|
||||
|
||||
|
||||
@pytest.fixture
def na_cmp():
    """Return a two-argument predicate that is truthy when both are NA."""

    def both_na(left, right):
        # ``and`` keeps the short-circuit: ``right`` is only inspected when
        # ``left`` is NA.
        return pd.isna(left) and pd.isna(right)

    return both_na
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_for_grouping(request):
    """SparseArray of ``[1, 1, NaN, NaN, 2, 2, 1, 3]`` for both fill values."""
    layout = [1, 1, np.nan, np.nan, 2, 2, 1, 3]
    return SparseArray(layout, fill_value=request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=[0, np.nan])
def data_for_compare(request):
    """SparseArray of ``[0, 0, NaN, -2, -1, 4, 2, 3, 0, 0]`` for both fill values."""
    layout = [0, 0, np.nan, -2, -1, 4, 2, 3, 0, 0]
    return SparseArray(layout, fill_value=request.param)
|
||||
|
||||
|
||||
class BaseSparseTests:
    """Mixin shared by the SparseArray test classes in this module."""

    def _check_unsupported(self, data):
        """Skip the calling test when ``data`` has dtype ``SparseDtype(int, 0)``."""
        int_zero_fill = SparseDtype(int, 0)
        if data.dtype == int_zero_fill:
            pytest.skip("Can't store nan in int array.")

    @pytest.mark.xfail(reason="SparseArray does not support setitem")
    def test_ravel(self, data):
        """Run the inherited ravel test; marked xfail."""
        super().test_ravel(data)
|
||||
|
||||
|
||||
class TestDtype(BaseSparseTests, base.BaseDtypeTests):
    """Dtype tests from the base suite, run against SparseDtype."""

    def test_array_type_with_arg(self, data, dtype):
        """``construct_array_type`` returns ``SparseArray``."""
        array_type = dtype.construct_array_type()
        assert array_type is SparseArray
|
||||
|
||||
|
||||
class TestInterface(BaseSparseTests, base.BaseInterfaceTests):
    """Interface tests from the base suite, run against SparseArray."""

    def test_copy(self, data):
        """Smoke-test ``copy``; __setitem__ does not work, so nothing more is checked."""
        data.copy()

    def test_view(self, data):
        """Smoke-test ``view``; __setitem__ does not work, so nothing more is checked."""
        data.view()
|
||||
|
||||
|
||||
class TestConstructors(BaseSparseTests, base.BaseConstructorsTests):
    """Constructor tests inherited from the base suite without overrides."""
|
||||
|
||||
|
||||
class TestReshaping(BaseSparseTests, base.BaseReshapingTests):
    """Reshaping tests from the base suite, run against SparseArray.

    Overrides that call ``_check_unsupported`` first are skipped for the
    ``SparseDtype(int, 0)`` parametrization.
    """

    def test_concat_mixed_dtypes(self, data):
        """Concatenate sparse, integer and categorical frames.

        https://github.com/pandas-dev/pandas/issues/20762
        This should be the same, aside from concat([sparse, float]).
        """
        sparse_df = pd.DataFrame({"A": data[:3]})
        int_df = pd.DataFrame({"A": [1, 2, 3]})
        cat_df = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
        dfs = [sparse_df, int_df, cat_df]

        # dataframes
        result = pd.concat(dfs)
        as_object = [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs]
        expected = pd.concat(as_object)
        self.assert_frame_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_concat_columns(data, na_value)

    def test_concat_extension_arrays_copy_false(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_concat_extension_arrays_copy_false(data, na_value)

    def test_align(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_align(data, na_value)

    def test_align_frame(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_align_frame(data, na_value)

    def test_align_series_frame(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_align_series_frame(data, na_value)

    def test_merge(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_merge(data, na_value)

    @pytest.mark.xfail(reason="SparseArray does not support setitem")
    def test_transpose(self, data):
        """Run the base transpose test; marked xfail."""
        super().test_transpose(data)
|
||||
|
||||
|
||||
class TestGetitem(BaseSparseTests, base.BaseGetitemTests):
    """Getitem tests from the base suite, run against SparseArray."""

    def test_get(self, data):
        """``Series.get`` by label agrees with positional ``iloc`` access.

        The index is ``0, 2, 4, ...`` so label 4 is position 2 and label 2 is
        position 1.
        """
        even_labels = [2 * i for i in range(len(data))]
        ser = pd.Series(data, index=even_labels)
        if np.isnan(ser.values.fill_value):
            # NaN != NaN, so check both sides for NaN instead of equality.
            assert np.isnan(ser.get(4)) and np.isnan(ser.iloc[2])
        else:
            assert ser.get(4) == ser.iloc[2]
        assert ser.get(2) == ser.iloc[1]

    def test_reindex(self, data, na_value):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_reindex(data, na_value)
|
||||
|
||||
|
||||
# Skipping TestSetitem, since we don't implement it.
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Index tests from the base suite, run against SparseArray."""

    def test_index_from_array(self, data):
        """``pd.Index(sparse)`` warns and yields the dtype matching the subtype."""
        with tm.assert_produces_warning(
            FutureWarning, match="will store that array directly"
        ):
            idx = pd.Index(data)

        subtype = data.dtype.subtype
        if subtype == "f":
            assert idx.dtype == np.float64
        elif subtype == "i":
            assert idx.dtype == np.int64
        else:
            assert idx.dtype == data.dtype.subtype

    # TODO(2.0): should pass once SparseArray is stored directly in Index.
    @pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
    def test_index_from_listlike_with_dtype(self, data):
        """Run the base test under a FutureWarning check; marked xfail."""
        with tm.assert_produces_warning(
            FutureWarning, match="passing a SparseArray to pd.Index"
        ):
            super().test_index_from_listlike_with_dtype(data)
|
||||
|
||||
|
||||
class TestMissing(BaseSparseTests, base.BaseMissingTests):
    """Missing-data tests from the base suite, run against SparseArray."""

    def test_isna(self, data_missing):
        """``isna`` returns a sparse boolean array, before and after filling."""
        sarr = SparseArray(data_missing)
        bool_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
        tm.assert_sp_array_equal(
            sarr.isna(), SparseArray([True, False], dtype=bool_dtype)
        )

        # test isna for arr without na
        sarr = sarr.fillna(0)
        bool_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
        no_na = SparseArray([False, False], fill_value=False, dtype=bool_dtype)
        self.assert_equal(sarr.isna(), no_na)

    def test_fillna_limit_pad(self, data_missing):
        """Run the base test, expecting a ``PerformanceWarning``."""
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_limit_pad(data_missing)

    def test_fillna_limit_backfill(self, data_missing):
        """Run the base test, expecting a ``PerformanceWarning``."""
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_limit_backfill(data_missing)

    def test_fillna_no_op_returns_copy(self, data, request):
        """Run the base test, expecting failure when the fill value is NaN."""
        if np.isnan(data.fill_value):
            mark = pytest.mark.xfail(reason="returns array with different fill value")
            request.node.add_marker(mark)
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_no_op_returns_copy(data)

    def test_fillna_series_method(self, data_missing):
        """Run the base limit-backfill test, expecting a ``PerformanceWarning``."""
        # NOTE(review): this delegates to ``test_fillna_limit_backfill`` rather
        # than the base ``test_fillna_series_method``; it looks like a
        # copy-paste slip -- confirm before changing.
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_limit_backfill(data_missing)

    @pytest.mark.skip(reason="Unsupported")
    def test_fillna_series(self):
        """Skipped unconditionally; noted in the original as looking doable."""
        pass

    def test_fillna_frame(self, data_missing):
        """Fill a frame with the valid element and compare with the expected frame.

        Have to override to specify that fill_value will change.
        """
        fill_value = data_missing[1]
        frame = pd.DataFrame({"A": data_missing, "B": [1, 2]})

        result = frame.fillna(fill_value)

        # A NaN fill value is replaced by the scalar used for filling.
        dtype = (
            SparseDtype(data_missing.dtype, fill_value)
            if pd.isna(data_missing.fill_value)
            else data_missing.dtype
        )

        filled = data_missing._from_sequence([fill_value, fill_value], dtype=dtype)
        expected = pd.DataFrame({"A": filled, "B": [1, 2]})

        self.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestMethods(BaseSparseTests, base.BaseMethodsTests):
    """Method tests from the base suite, run against SparseArray."""

    def test_combine_le(self, data_repeated):
        """``Series.combine`` with ``<=`` gives a sparse boolean Series.

        Series[SparseArray].__le__ returns a Series[Sparse[bool]] rather than
        a Series[bool], so the expected values are built as SparseArray.
        """
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)

        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
        pairwise = [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
        expected = pd.Series(SparseArray(pairwise, fill_value=False))
        self.assert_series_equal(result, expected)

        # Same comparison against a single scalar.
        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 <= x2)
        against_scalar = [a <= val for a in list(orig_data1)]
        expected = pd.Series(SparseArray(against_scalar, fill_value=False))
        self.assert_series_equal(result, expected)

    def test_fillna_copy_frame(self, data_missing):
        """Check the identity assertions on the frame before and after ``fillna``."""
        arr = data_missing.take([1, 1])
        df = pd.DataFrame({"A": arr}, copy=False)

        filled_val = df.iloc[0, 0]
        result = df.fillna(filled_val)

        # Only checked when the manager exposes ``blocks``.
        if hasattr(df._mgr, "blocks"):
            assert df.values.base is not result.values.base
        assert df.A._values.to_dense() is arr.to_dense()

    def test_fillna_copy_series(self, data_missing):
        """Check the identity assertions on the Series before and after ``fillna``."""
        arr = data_missing.take([1, 1])
        ser = pd.Series(arr)

        filled_val = ser[0]
        result = ser.fillna(filled_val)

        assert ser._values is not result._values
        assert ser._values.to_dense() is arr.to_dense()

    @pytest.mark.skip(reason="Not Applicable")
    def test_fillna_length_mismatch(self, data_missing):
        """Skipped unconditionally; the body is intentionally empty."""
        pass

    def test_where_series(self, data, na_value):
        """``Series.where`` with and without an ``other`` argument."""
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))

        # Without ``other`` the masked positions become NA.
        result = ser.where(np.array([True, True, False, False]))

        new_dtype = SparseDtype("float", 0.0)
        masked = cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype)
        self.assert_series_equal(result, pd.Series(masked))

        # With ``other`` the masked positions are taken from it.
        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        result = ser.where(np.array([True, False, True, True]), other)
        replaced = cls._from_sequence([a, b, b, b], dtype=data.dtype)
        self.assert_series_equal(result, pd.Series(replaced))

    def test_combine_first(self, data, request):
        """Run the base test, expecting failure when the subtype equals ``"int"``."""
        if data.dtype.subtype == "int":
            # Right now this is upcasted to float, just like combine_first
            # for Series[int]
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="TODO(SparseArray.__setitem__) will preserve dtype."
                )
            )
        super().test_combine_first(data)

    def test_searchsorted(self, data_for_sorting, as_series):
        """Run the base test, expecting a ``PerformanceWarning``."""
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_searchsorted(data_for_sorting, as_series)

    def test_shift_0_periods(self, data):
        """GH#33856 shifting with periods=0 should return a copy, not same obj."""
        result = data.shift(0)

        # Mutate the source's sparse values; the shifted result must not follow.
        data._sparse_values[0] = data._sparse_values[1]
        assert result._sparse_values[0] != result._sparse_values[1]

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_all_na(self, method, data, na_value):
        """Run the base test unless the dtype is unsupported.

        Overridden because Sparse[int64, 0] cannot handle na_value.
        """
        self._check_unsupported(data)
        super().test_argmin_argmax_all_na(method, data, na_value)

    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
    def test_equals(self, data, na_value, as_series, box):
        """Run the base test unless the dtype is unsupported."""
        self._check_unsupported(data)
        super().test_equals(data, na_value, as_series, box)
|
||||
|
||||
|
||||
class TestCasting(BaseSparseTests, base.BaseCastingTests):
    """Casting tests from the base suite, run against SparseArray."""

    def test_astype_object_series(self, all_data):
        """``Series.astype(object)`` yields object dtype on the Series and its array.

        Unlike the base class, we do not expect the resulting Block to be
        ObjectBlock / resulting array to be np.dtype("object").
        """
        casted = pd.Series(all_data, name="A").astype(object)
        assert is_object_dtype(casted.dtype)
        assert is_object_dtype(casted._mgr.array.dtype)

    def test_astype_object_frame(self, all_data):
        """``DataFrame.astype(object)`` yields an object-dtype first array.

        Unlike the base class, we do not expect the resulting Block to be
        ObjectBlock / resulting array to be np.dtype("object").
        """
        df = pd.DataFrame({"A": all_data})

        result = df.astype(object)
        first_array = result._mgr.arrays[0]
        assert is_object_dtype(first_array.dtype)

        # earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64"
        #  instead of returning False
        if not np_version_under1p20:
            # check that we can compare the dtypes
            assert not (result.dtypes == df.dtypes).any()

    def test_astype_str(self, data):
        """``astype(str)`` gives a sparse string Series with a stringified fill value."""
        head = data[:5]
        result = pd.Series(head).astype(str)
        expected_dtype = SparseDtype(str, str(data.fill_value))
        as_text = [str(x) for x in head]
        expected = pd.Series(as_text, dtype=expected_dtype)
        self.assert_series_equal(result, expected)

    @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype")
    def test_astype_string(self, data):
        """Run the base test; marked xfail, expecting ``TypeError``."""
        super().test_astype_string(data)
|
||||
|
||||
|
||||
class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests for the sparse extension array.

    The ``*_exc`` attributes are set to ``None``; they are read by the base
    test class (presumably as "no exception expected" -- confirm against
    ``base.BaseArithmeticOpsTests``).
    """

    series_scalar_exc = None
    frame_scalar_exc = None
    divmod_exc = None
    series_array_exc = None

    def _skip_if_different_combine(self, data):
        """Skip the running test when ``data.fill_value`` is 0."""
        if data.fill_value == 0:
            # arith ops call on dtype.fill_value so that the sparsity
            # is maintained. Combine can't be called on a dtype in
            # general, so we can't make the expected. This is tested elsewhere
            #
            # pytest.skip() raises by itself, so no ``raise`` is needed here.
            pytest.skip("Incorrected expected from Series.combine")

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Run the inherited Series-with-scalar test unless it must be skipped."""
        self._skip_if_different_combine(data)
        super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Run the inherited Series-with-array test unless it must be skipped."""
        self._skip_if_different_combine(data)
        super().test_arith_series_with_array(data, all_arithmetic_operators)

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
        """Run the inherited frame-with-scalar test, xfailing known mismatches.

        The xfail marker is added only when the dtype's fill value is 0 and
        the operator is not one of the names listed below.
        """
        ops_not_marked = [
            "mul",
            "rmul",
            "floordiv",
            "rfloordiv",
            "pow",
            "mod",
            "rmod",
        ]
        # ``not (x != 0)`` mirrors the original branch exactly, including for
        # a NaN fill value (NaN != 0 is True, so nothing is marked).
        if (
            not (data.dtype.fill_value != 0)
            and all_arithmetic_operators.strip("_") not in ops_not_marked
        ):
            mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
            request.node.add_marker(mark)
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def _check_divmod_op(self, ser, op, other, exc=NotImplementedError):
        """Delegate to the base divmod check, always passing ``exc=None``.

        The ``exc`` argument is accepted for signature compatibility but is
        deliberately ignored.
        """
        # We implement divmod
        super()._check_divmod_op(ser, op, other, exc=None)
|
||||
|
||||
|
||||
class TestComparisonOps(BaseSparseTests):
    """Comparison-operator tests for SparseArray."""

    def _compare_other(self, data_for_compare: SparseArray, comparison_op, other):
        """Apply ``comparison_op`` and compare against a densely built result."""
        actual = comparison_op(data_for_compare, other)
        assert isinstance(actual, SparseArray)
        assert actual.dtype.subtype == np.bool_

        own_fill = data_for_compare.fill_value
        if not isinstance(other, SparseArray):
            # Compare the fill value against every element of ``other`` and
            # collapse the outcome into a single value.
            expected_fill = np.all(
                comparison_op(np.asarray(own_fill), np.asarray(other))
            )
        else:
            expected_fill = comparison_op(own_fill, other.fill_value)

        dense_outcome = comparison_op(
            data_for_compare.to_dense(), np.asarray(other)
        )
        expected = SparseArray(
            dense_outcome,
            fill_value=expected_fill,
            dtype=np.bool_,
        )
        tm.assert_sp_array_equal(actual, expected)

    def test_scalar(self, data_for_compare: SparseArray, comparison_op):
        """Compare against the scalars 0, 1, -1 and NaN, in that order."""
        for scalar in (0, 1, -1, np.nan):
            self._compare_other(data_for_compare, comparison_op, scalar)

    @pytest.mark.xfail(reason="Wrong indices")
    def test_array(self, data_for_compare: SparseArray, comparison_op):
        """Compare against a ten-element linspace array (expected to fail)."""
        dense_other = np.linspace(-4, 5, 10)
        self._compare_other(data_for_compare, comparison_op, dense_other)

    @pytest.mark.xfail(reason="Wrong indices")
    def test_sparse_array(self, data_for_compare: SparseArray, comparison_op):
        """Compare against ``data + 1`` and then ``data * 2`` (expected to fail)."""
        shifted = data_for_compare + 1
        self._compare_other(data_for_compare, comparison_op, shifted)
        doubled = data_for_compare * 2
        self._compare_other(data_for_compare, comparison_op, doubled)
|
||||
|
||||
|
||||
class TestPrinting(BaseSparseTests, base.BasePrintingTests):
    """Printing tests for the sparse extension array."""

    @pytest.mark.xfail(reason="Different repr")
    def test_array_repr(self, data, size):
        """Run the inherited repr test; it is marked as an expected failure."""
        super().test_array_repr(data, size)
|
||||
|
||||
|
||||
class TestParsing(BaseSparseTests, base.BaseParsingTests):
    """Parsing tests for the sparse extension array."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Expect the inherited test to raise NotImplementedError."""
        with pytest.raises(
            NotImplementedError,
            match=r".*must implement _from_sequence_of_strings.*",
        ):
            super().test_EA_types(engine, data)
|
||||
@@ -0,0 +1,210 @@
|
||||
"""
|
||||
This file contains a minimal set of tests for compliance with the extension
|
||||
array interface test suite, and should contain no other tests.
|
||||
The test suite for the full functionality of the array is located in
|
||||
`pandas/tests/arrays/`.
|
||||
|
||||
The tests in this file are inherited from the BaseExtensionTests, and only
|
||||
minimal tweaks should be applied to get the tests passing (by overwriting a
|
||||
parent method).
|
||||
|
||||
Additional tests should either be added to one of the BaseExtensionTests
|
||||
classes (if they are relevant for the extension interface for all dtypes), or
|
||||
be added to the array-specific tests in `pandas/tests/arrays/`.
|
||||
|
||||
"""
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import pa_version_under2p0
|
||||
|
||||
import pandas as pd
|
||||
from pandas.core.arrays import ArrowStringArray
|
||||
from pandas.core.arrays.string_ import StringDtype
|
||||
from pandas.tests.extension import base
|
||||
|
||||
|
||||
def split_array(arr):
    """Rebuild ``arr`` on top of a two-chunk pyarrow chunked array.

    Skips the running test unless ``arr.dtype.storage`` is "pyarrow".
    The backing data (``arr._data``) is cut at its midpoint, the chunks of
    both halves are gathered into a new chunked array, and a new array of
    the same type as ``arr`` is constructed from it.
    """
    if arr.dtype.storage != "pyarrow":
        pytest.skip("only applicable for pyarrow chunked array n/a")

    # Imported lazily so that pyarrow is only needed on this code path.
    import pyarrow as pa

    backing = arr._data
    midpoint = len(backing) // 2
    front, back = backing[:midpoint], backing[midpoint:]
    rechunked = pa.chunked_array([*front.chunks, *back.chunks])
    assert rechunked.num_chunks == 2
    return type(arr)(rechunked)
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def chunked(request):
    """Parametrized fixture yielding ``True`` and then ``False``."""
    is_chunked = request.param
    return is_chunked
|
||||
|
||||
|
||||
@pytest.fixture
def dtype(string_storage):
    """Return a StringDtype using the storage named by ``string_storage``."""
    string_dtype = StringDtype(storage=string_storage)
    return string_dtype
|
||||
|
||||
|
||||
@pytest.fixture
def data(dtype, chunked):
    """Length-100 array of random ASCII letters whose first two values differ."""
    alphabet = list(string.ascii_letters)
    while True:
        letters = np.random.choice(alphabet, size=100)
        # Redraw until the first two entries are different.
        if letters[0] != letters[1]:
            break

    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(letters)
    if chunked:
        return split_array(arr)
    return arr
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing(dtype, chunked):
    """Length 2 array with [NA, Valid]"""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence([pd.NA, "A"])
    if chunked:
        return split_array(arr)
    return arr
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_sorting(dtype, chunked):
    """Length 3 array holding "B", "C", "A" in that order."""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(["B", "C", "A"])
    if chunked:
        return split_array(arr)
    return arr
|
||||
|
||||
|
||||
@pytest.fixture
def data_missing_for_sorting(dtype, chunked):
    """Length 3 array holding "B", NA, "A" in that order."""
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(["B", pd.NA, "A"])
    if chunked:
        return split_array(arr)
    return arr
|
||||
|
||||
|
||||
@pytest.fixture
def na_value():
    """Return the missing-value scalar used by these tests (``pd.NA``)."""
    missing = pd.NA
    return missing
|
||||
|
||||
|
||||
@pytest.fixture
def data_for_grouping(dtype, chunked):
    """Length 8 array with the layout [B, B, NA, NA, A, A, B, C]."""
    values = ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]
    array_cls = dtype.construct_array_type()
    arr = array_cls._from_sequence(values)
    if chunked:
        return split_array(arr)
    return arr
|
||||
|
||||
|
||||
class TestDtype(base.BaseDtypeTests):
    """Dtype tests for the string extension dtype."""

    def test_eq_with_str(self, dtype):
        """Check equality with the ``string[<storage>]`` alias, then the base test."""
        storage_alias = f"string[{dtype.storage}]"
        assert dtype == storage_alias
        super().test_eq_with_str(dtype)
|
||||
|
||||
|
||||
class TestInterface(base.BaseInterfaceTests):
    """Interface tests for the string extension array."""

    def test_view(self, data, request):
        """Run the inherited view test, xfailing it for pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            request.node.add_marker(pytest.mark.xfail(reason="not implemented"))
        super().test_view(data)
|
||||
|
||||
|
||||
class TestConstructors(base.BaseConstructorsTests):
    """Constructor tests for the string extension array."""

    def test_from_dtype(self, data):
        """Intentionally empty override of the inherited test.

        The base test uses the string representation of the dtype.
        """
|
||||
|
||||
|
||||
class TestReshaping(base.BaseReshapingTests):
    """Reshaping tests for the string extension array."""

    def test_transpose(self, data, request):
        """Run the inherited transpose test, xfailing it for pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            request.node.add_marker(pytest.mark.xfail(reason="not implemented"))
        super().test_transpose(data)
|
||||
|
||||
|
||||
class TestGetitem(base.BaseGetitemTests):
    """Inherits every test from ``base.BaseGetitemTests`` without overrides."""
|
||||
|
||||
|
||||
class TestSetitem(base.BaseSetitemTests):
    """Setitem tests for the string extension array."""

    def test_setitem_preserves_views(self, data, request):
        """Run the inherited view-preservation test, xfailing pyarrow storage."""
        uses_pyarrow = data.dtype.storage == "pyarrow"
        if uses_pyarrow:
            request.node.add_marker(pytest.mark.xfail(reason="not implemented"))
        super().test_setitem_preserves_views(data)
|
||||
|
||||
|
||||
class TestIndex(base.BaseIndexTests):
    """Inherits every test from ``base.BaseIndexTests`` without overrides."""
|
||||
|
||||
|
||||
class TestMissing(base.BaseMissingTests):
    """Inherits every test from ``base.BaseMissingTests`` without overrides."""
|
||||
|
||||
|
||||
class TestNoReduce(base.BaseNoReduceTests):
    """Reduction tests for the string extension array."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """Expect TypeError from every numeric reduction except min and max."""
        reduction = all_numeric_reductions

        # "min" and "max" are not checked by this test.
        if reduction in ("min", "max"):
            return None

        with pytest.raises(TypeError):
            getattr(pd.Series(data), reduction)(skipna=skipna)
|
||||
|
||||
|
||||
class TestMethods(base.BaseMethodsTests):
    """Method tests for the string extension array."""

    @pytest.mark.skip(reason="returns nullable")
    def test_value_counts(self, all_data, dropna):
        """Skipped override; it would delegate to the inherited test."""
        return super().test_value_counts(all_data, dropna)

    @pytest.mark.xfail(reason="returns nullable: GH 44692")
    def test_value_counts_with_normalize(self, data):
        """Run the inherited test; it is marked as an expected failure."""
        super().test_value_counts_with_normalize(data)
|
||||
|
||||
|
||||
class TestCasting(base.BaseCastingTests):
    """Inherits every test from ``base.BaseCastingTests`` without overrides."""
|
||||
|
||||
|
||||
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison-operator tests for the string extension array."""

    def _compare_other(self, ser, data, op, other):
        """Compare the dunder-method result with an object-dtype reference.

        The reference is computed on ``ser`` cast to object and then cast to
        the "boolean" dtype.
        """
        dunder = f"__{op.__name__}__"
        actual = getattr(ser, dunder)(other)

        as_object = ser.astype(object)
        reference = getattr(as_object, dunder)(other)
        expected = reference.astype("boolean")
        self.assert_series_equal(actual, expected)

    def test_compare_scalar(self, data, comparison_op):
        """Compare a Series of ``data`` against the scalar "abc"."""
        self._compare_other(pd.Series(data), data, comparison_op, "abc")
|
||||
|
||||
|
||||
class TestParsing(base.BaseParsingTests):
    """Inherits every test from ``base.BaseParsingTests`` without overrides."""
|
||||
|
||||
|
||||
class TestPrinting(base.BasePrintingTests):
    """Inherits every test from ``base.BasePrintingTests`` without overrides."""
|
||||
|
||||
|
||||
class TestGroupBy(base.BaseGroupbyTests):
    """Groupby tests for the string extension array."""

    def test_groupby_extension_transform(self, data_for_grouping, request):
        """Run the inherited transform test, xfailing old pyarrow versions."""
        uses_pyarrow = data_for_grouping.dtype.storage == "pyarrow"
        if uses_pyarrow and pa_version_under2p0:
            # failure observed in 1.0.1, not in 2.0 or later
            request.node.add_marker(
                pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
            )
        super().test_groupby_extension_transform(data_for_grouping)
|
||||
|
||||
|
||||
class Test2DCompat(base.Dim2CompatTests):
    """2D-compatibility tests for the string extension array."""

    @pytest.fixture(autouse=True)
    def arrow_not_supported(self, data, request):
        """Mark every test in this class as xfail for ArrowStringArray data."""
        if not isinstance(data, ArrowStringArray):
            return
        request.node.add_marker(
            pytest.mark.xfail(
                reason="2D support not implemented for ArrowStringArray"
            )
        )
|
||||
Reference in New Issue
Block a user