first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from pandas import (
DataFrame,
concat,
)
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = {"A": "float32", "B": "float32", "C": "float16", "D": "float64"}
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get("A"):
assert df.dtypes["A"] == dtypes["A"]
if dtypes.get("B"):
assert df.dtypes["B"] == dtypes["B"]
if dtypes.get("C"):
assert df.dtypes["C"] == dtypes["C"]
if dtypes.get("D"):
assert df.dtypes["D"] == dtypes["D"]
def _check_mixed_int(df, dtype=None):
dtypes = {"A": "int32", "B": "uint64", "C": "uint8", "D": "int64"}
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get("A"):
assert df.dtypes["A"] == dtypes["A"]
if dtypes.get("B"):
assert df.dtypes["B"] == dtypes["B"]
if dtypes.get("C"):
assert df.dtypes["C"] == dtypes["C"]
if dtypes.get("D"):
assert df.dtypes["D"] == dtypes["D"]
def zip_frames(frames: list[DataFrame], axis: int = 1) -> DataFrame:
"""
take a list of frames, zip them together under the
assumption that these all have the first frames' index/columns.
Returns
-------
new_frame : DataFrame
"""
if axis == 1:
columns = frames[0].columns
zipped = [f.loc[:, c] for c in columns for f in frames]
return concat(zipped, axis=1)
else:
index = frames[0].index
zipped = [f.loc[i, :] for i in index for f in frames]
return DataFrame(zipped)

View File

@@ -0,0 +1,284 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
NaT,
date_range,
)
import pandas._testing as tm
@pytest.fixture
def float_frame_with_na():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
A B C D
ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997
DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872
neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522
0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018
3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826
soujjZ0A08 NaN NaN NaN NaN
7W6NLGsjB9 NaN NaN NaN NaN
... ... ... ... ...
uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590
n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717
ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189
uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503
3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947
2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083
sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData())
# set some NAs
df.iloc[5:10] = np.nan
df.iloc[15:20, -2:] = np.nan
return df
@pytest.fixture
def bool_frame_with_na():
"""
Fixture for DataFrame of booleans with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
A B C D
zBZxY2IDGd False False False False
IhBWBMWllt False True True True
ctjdvZSR6R True False True True
AVTujptmxb False True False True
G9lrImrSWq False False False True
sFFwdIUfz2 NaN NaN NaN NaN
s15ptEJnRb NaN NaN NaN NaN
... ... ... ... ...
UW41KkDyZ4 True True False False
l9l6XkOdqV True False False False
X2MeZfzDYA False True False False
xWkIKU7vfX False True False True
QOhL6VmpGU False False False True
22PwkRJdat False True False False
kfboQ3VeIK True False True False
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData()) > 0
df = df.astype(object)
# set some NAs
df.iloc[5:10] = np.nan
df.iloc[15:20, -2:] = np.nan
# For `any` tests we need to have at least one True before the first NaN
# in each column
for i in range(4):
df.iloc[i, i] = True
return df
@pytest.fixture
def float_string_frame():
"""
Fixture for DataFrame of floats and strings with index of unique strings
Columns are ['A', 'B', 'C', 'D', 'foo'].
A B C D foo
w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar
PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar
ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar
3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar
khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar
LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar
HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar
... ... ... ... ... ...
9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar
h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar
mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar
oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar
9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar
jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar
lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar
[30 rows x 5 columns]
"""
df = DataFrame(tm.getSeriesData())
df["foo"] = "bar"
return df
@pytest.fixture
def mixed_float_frame():
"""
Fixture for DataFrame of different float types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
A B C D
GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993
KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588
VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731
kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607
CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266
0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541
tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710
... ... ... ... ...
7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237
4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612
B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653
hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427
1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827
9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204
xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData())
df.A = df.A.astype("float32")
df.B = df.B.astype("float32")
df.C = df.C.astype("float16")
df.D = df.D.astype("float64")
return df
@pytest.fixture
def mixed_int_frame():
"""
Fixture for DataFrame of different int types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
A B C D
mUrCZ67juP 0 1 2 2
rw99ACYaKS 0 1 0 0
7QsEcpaaVU 0 1 1 1
xkrimI2pcE 0 1 0 0
dz01SuzoS8 0 1 255 255
ccQkqOHX75 -1 1 0 0
DN0iXaoDLd 0 1 0 0
... .. .. ... ...
Dfb141wAaQ 1 1 254 254
IPD8eQOVu5 0 1 0 0
CcaKulsCmv 0 1 0 0
rIBa8gu7E5 0 1 0 0
RP6peZmh5o 0 1 1 1
NMb9pipQWQ 0 1 0 0
PqgbJEzjib 0 1 3 3
[30 rows x 4 columns]
"""
df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
df.A = df.A.astype("int32")
df.B = np.ones(len(df.B), dtype="uint64")
df.C = df.C.astype("uint8")
df.D = df.C.astype("int64")
return df
@pytest.fixture
def timezone_frame():
"""
Fixture for DataFrame of date_range Series with different time zones
Columns are ['A', 'B', 'C']; some entries are missing
A B C
0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00
1 2013-01-02 NaT NaT
2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00
"""
df = DataFrame(
{
"A": date_range("20130101", periods=3),
"B": date_range("20130101", periods=3, tz="US/Eastern"),
"C": date_range("20130101", periods=3, tz="CET"),
}
)
df.iloc[1, 1] = NaT
df.iloc[1, 2] = NaT
return df
@pytest.fixture
def uint64_frame():
"""
Fixture for DataFrame with uint64 values
Columns are ['A', 'B']
"""
return DataFrame(
{"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, dtype=np.uint64
)
@pytest.fixture
def simple_frame():
"""
Fixture for simple 3x3 DataFrame
Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
one two three
a 1.0 2.0 3.0
b 4.0 5.0 6.0
c 7.0 8.0 9.0
"""
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
@pytest.fixture
def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
A B C D E (tuple, as, label)
0 foo one a 0.608477 -0.012500 -1.664297
1 foo two b -0.633460 0.249614 -0.364411
2 foo three c 0.615256 2.154968 -0.834666
3 bar one d 0.234246 1.085675 0.718445
4 bar two e 0.533841 -0.005702 -3.533912
"""
df = DataFrame(
{
"A": ["foo", "foo", "foo", "bar", "bar"],
"B": ["one", "two", "three", "one", "two"],
"C": ["a", "b", "c", "d", "e"],
"D": np.random.randn(5),
"E": np.random.randn(5),
("tuple", "as", "label"): np.random.randn(5),
}
)
return df
@pytest.fixture(
params=[
"any",
"all",
"count",
"sum",
"prod",
"max",
"min",
"mean",
"median",
"skew",
"kurt",
"sem",
"var",
"std",
"mad",
]
)
def reduction_functions(request):
return request.param

View File

@@ -0,0 +1,191 @@
from collections import OrderedDict
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
from pandas.core.construction import create_series_with_explicit_dtype
class TestFromDict:
# Note: these tests are specific to the from_dict method, not for
# passing dictionaries to DataFrame.__init__
def test_from_dict_scalars_requires_index(self):
msg = "If using all scalar values, you must pass an index"
with pytest.raises(ValueError, match=msg):
DataFrame.from_dict(OrderedDict([("b", 8), ("a", 5), ("a", 6)]))
def test_constructor_list_of_odicts(self):
data = [
OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
OrderedDict([["a", 1.5], ["d", 6]]),
OrderedDict(),
OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]),
OrderedDict([["b", 3], ["c", 4], ["d", 6]]),
]
result = DataFrame(data)
expected = DataFrame.from_dict(
dict(zip(range(len(data)), data)), orient="index"
)
tm.assert_frame_equal(result, expected.reindex(result.index))
def test_constructor_single_row(self):
data = [OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]])]
result = DataFrame(data)
expected = DataFrame.from_dict(dict(zip([0], data)), orient="index").reindex(
result.index
)
tm.assert_frame_equal(result, expected)
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]),
]
sdict = OrderedDict(zip(["x", "y"], data))
idx = Index(["a", "b", "c"])
# all named
data2 = [
Series([1.5, 3, 4], idx, dtype="O", name="x"),
Series([1.5, 3, 6], idx, name="y"),
]
result = DataFrame(data2)
expected = DataFrame.from_dict(sdict, orient="index")
tm.assert_frame_equal(result, expected)
# some unnamed
data2 = [
Series([1.5, 3, 4], idx, dtype="O", name="x"),
Series([1.5, 3, 6], idx),
]
result = DataFrame(data2)
sdict = OrderedDict(zip(["x", "Unnamed 0"], data))
expected = DataFrame.from_dict(sdict, orient="index")
tm.assert_frame_equal(result, expected)
# none named
data = [
OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]),
OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]),
OrderedDict([["a", 1.5], ["d", 6]]),
OrderedDict(),
OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]),
OrderedDict([["b", 3], ["c", 4], ["d", 6]]),
]
data = [
create_series_with_explicit_dtype(d, dtype_if_empty=object) for d in data
]
result = DataFrame(data)
sdict = OrderedDict(zip(range(len(data)), data))
expected = DataFrame.from_dict(sdict, orient="index")
tm.assert_frame_equal(result, expected.reindex(result.index))
result2 = DataFrame(data, index=np.arange(6))
tm.assert_frame_equal(result, result2)
result = DataFrame([Series(dtype=object)])
expected = DataFrame(index=[0])
tm.assert_frame_equal(result, expected)
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]),
]
sdict = OrderedDict(zip(range(len(data)), data))
idx = Index(["a", "b", "c"])
data2 = [Series([1.5, 3, 4], idx, dtype="O"), Series([1.5, 3, 6], idx)]
result = DataFrame(data2)
expected = DataFrame.from_dict(sdict, orient="index")
tm.assert_frame_equal(result, expected)
def test_constructor_orient(self, float_string_frame):
data_dict = float_string_frame.T._series
recons = DataFrame.from_dict(data_dict, orient="index")
expected = float_string_frame.reindex(index=recons.index)
tm.assert_frame_equal(recons, expected)
# dict of sequence
a = {"hi": [32, 3, 3], "there": [3, 5, 3]}
rs = DataFrame.from_dict(a, orient="index")
xp = DataFrame.from_dict(a).T.reindex(list(a.keys()))
tm.assert_frame_equal(rs, xp)
def test_constructor_from_ordered_dict(self):
# GH#8425
a = OrderedDict(
[
("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])),
("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])),
("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])),
]
)
expected = DataFrame.from_dict(a, orient="columns").T
result = DataFrame.from_dict(a, orient="index")
tm.assert_frame_equal(result, expected)
def test_from_dict_columns_parameter(self):
# GH#18529
# Test new columns parameter for from_dict that was added to make
# from_items(..., orient='index', columns=[...]) easier to replicate
result = DataFrame.from_dict(
OrderedDict([("A", [1, 2]), ("B", [4, 5])]),
orient="index",
columns=["one", "two"],
)
expected = DataFrame([[1, 2], [4, 5]], index=["A", "B"], columns=["one", "two"])
tm.assert_frame_equal(result, expected)
msg = "cannot use columns parameter with orient='columns'"
with pytest.raises(ValueError, match=msg):
DataFrame.from_dict(
{"A": [1, 2], "B": [4, 5]},
orient="columns",
columns=["one", "two"],
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_dict({"A": [1, 2], "B": [4, 5]}, columns=["one", "two"])
@pytest.mark.parametrize(
"data_dict, keys, orient",
[
({}, [], "index"),
([{("a",): 1}, {("a",): 2}], [("a",)], "columns"),
([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)], "columns"),
([{("a", "b"): 1}], [("a", "b")], "columns"),
],
)
def test_constructor_from_dict_tuples(self, data_dict, keys, orient):
# GH#16769
df = DataFrame.from_dict(data_dict, orient)
result = df.columns
expected = Index(keys, dtype="object", tupleize_cols=False)
tm.assert_index_equal(result, expected)
def test_frame_dict_constructor_empty_series(self):
s1 = Series(
[1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)])
)
s2 = Series(
[1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])
)
s3 = Series(dtype=object)
# it works!
DataFrame({"foo": s1, "bar": s2, "baz": s3})
DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2})

View File

@@ -0,0 +1,466 @@
from datetime import datetime
from decimal import Decimal
import numpy as np
import pytest
import pytz
from pandas.compat import is_platform_little_endian
from pandas import (
CategoricalIndex,
DataFrame,
Index,
Interval,
RangeIndex,
Series,
)
import pandas._testing as tm
class TestFromRecords:
def test_from_records_with_datetimes(self):
# this may fail on certain platforms because of a numpy issue
# related GH#6140
if not is_platform_little_endian():
pytest.skip("known failure of test on non-little endian")
# construction with a null in a recarray
# GH#6140
expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]})
arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
dtypes = [("EXPIRY", "<M8[ns]")]
recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
result = DataFrame.from_records(recarray)
tm.assert_frame_equal(result, expected)
# coercion should work too
arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
dtypes = [("EXPIRY", "<M8[m]")]
recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
result = DataFrame.from_records(recarray)
tm.assert_frame_equal(result, expected)
def test_from_records_sequencelike(self):
df = DataFrame(
{
"A": np.array(np.random.randn(6), dtype=np.float64),
"A1": np.array(np.random.randn(6), dtype=np.float64),
"B": np.array(np.arange(6), dtype=np.int64),
"C": ["foo"] * 6,
"D": np.array([True, False] * 3, dtype=bool),
"E": np.array(np.random.randn(6), dtype=np.float32),
"E1": np.array(np.random.randn(6), dtype=np.float32),
"F": np.array(np.arange(6), dtype=np.int32),
}
)
# this is actually tricky to create the recordlike arrays and
# have the dtypes be intact
blocks = df._to_dict_of_blocks()
tuples = []
columns = []
dtypes = []
for dtype, b in blocks.items():
columns.extend(b.columns)
dtypes.extend([(c, np.dtype(dtype).descr[0][1]) for c in b.columns])
for i in range(len(df.index)):
tup = []
for _, b in blocks.items():
tup.extend(b.iloc[i].values)
tuples.append(tuple(tup))
recarray = np.array(tuples, dtype=dtypes).view(np.recarray)
recarray2 = df.to_records()
lists = [list(x) for x in tuples]
# tuples (lose the dtype info)
result = DataFrame.from_records(tuples, columns=columns).reindex(
columns=df.columns
)
# created recarray and with to_records recarray (have dtype info)
result2 = DataFrame.from_records(recarray, columns=columns).reindex(
columns=df.columns
)
result3 = DataFrame.from_records(recarray2, columns=columns).reindex(
columns=df.columns
)
# list of tupels (no dtype info)
result4 = DataFrame.from_records(lists, columns=columns).reindex(
columns=df.columns
)
tm.assert_frame_equal(result, df, check_dtype=False)
tm.assert_frame_equal(result2, df)
tm.assert_frame_equal(result3, df)
tm.assert_frame_equal(result4, df, check_dtype=False)
# tuples is in the order of the columns
result = DataFrame.from_records(tuples)
tm.assert_index_equal(result.columns, RangeIndex(8))
# test exclude parameter & we are casting the results here (as we don't
# have dtype info to recover)
columns_to_test = [columns.index("C"), columns.index("E1")]
exclude = list(set(range(8)) - set(columns_to_test))
result = DataFrame.from_records(tuples, exclude=exclude)
result.columns = [columns[i] for i in sorted(columns_to_test)]
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"])
def test_from_records_sequencelike_empty(self):
# empty case
result = DataFrame.from_records([], columns=["foo", "bar", "baz"])
assert len(result) == 0
tm.assert_index_equal(result.columns, Index(["foo", "bar", "baz"]))
result = DataFrame.from_records([])
assert len(result) == 0
assert len(result.columns) == 0
def test_from_records_dictlike(self):
# test the dict methods
df = DataFrame(
{
"A": np.array(np.random.randn(6), dtype=np.float64),
"A1": np.array(np.random.randn(6), dtype=np.float64),
"B": np.array(np.arange(6), dtype=np.int64),
"C": ["foo"] * 6,
"D": np.array([True, False] * 3, dtype=bool),
"E": np.array(np.random.randn(6), dtype=np.float32),
"E1": np.array(np.random.randn(6), dtype=np.float32),
"F": np.array(np.arange(6), dtype=np.int32),
}
)
# columns is in a different order here than the actual items iterated
# from the dict
blocks = df._to_dict_of_blocks()
columns = []
for b in blocks.values():
columns.extend(b.columns)
asdict = {x: y for x, y in df.items()}
asdict2 = {x: y.values for x, y in df.items()}
# dict of series & dict of ndarrays (have dtype info)
results = []
results.append(DataFrame.from_records(asdict).reindex(columns=df.columns))
results.append(
DataFrame.from_records(asdict, columns=columns).reindex(columns=df.columns)
)
results.append(
DataFrame.from_records(asdict2, columns=columns).reindex(columns=df.columns)
)
for r in results:
tm.assert_frame_equal(r, df)
def test_from_records_with_index_data(self):
df = DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])
data = np.random.randn(10)
df1 = DataFrame.from_records(df, index=data)
tm.assert_index_equal(df1.index, Index(data))
def test_from_records_bad_index_column(self):
df = DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])
# should pass
df1 = DataFrame.from_records(df, index=["C"])
tm.assert_index_equal(df1.index, Index(df.C))
df1 = DataFrame.from_records(df, index="C")
tm.assert_index_equal(df1.index, Index(df.C))
# should fail
msg = "|".join(
[
r"Length of values \(10\) does not match length of index \(1\)",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(df, index=[2])
with pytest.raises(KeyError, match=r"^2$"):
DataFrame.from_records(df, index=2)
def test_from_records_non_tuple(self):
class Record:
def __init__(self, *args):
self.args = args
def __getitem__(self, i):
return self.args[i]
def __iter__(self):
return iter(self.args)
recs = [Record(1, 2, 3), Record(4, 5, 6), Record(7, 8, 9)]
tups = [tuple(rec) for rec in recs]
result = DataFrame.from_records(recs)
expected = DataFrame.from_records(tups)
tm.assert_frame_equal(result, expected)
def test_from_records_len0_with_columns(self):
# GH#2633
result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
expected = Index(["bar"])
assert len(result) == 0
assert result.index.name == "foo"
tm.assert_index_equal(result.columns, expected)
def test_from_records_series_list_dict(self):
# GH#27358
expected = DataFrame([[{"a": 1, "b": 2}, {"a": 3, "b": 4}]]).T
data = Series([[{"a": 1, "b": 2}], [{"a": 3, "b": 4}]])
result = DataFrame.from_records(data)
tm.assert_frame_equal(result, expected)
def test_from_records_series_categorical_index(self):
# GH#32805
index = CategoricalIndex(
[Interval(-20, -10), Interval(-10, 0), Interval(0, 10)]
)
series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index)
frame = DataFrame.from_records(series_of_dicts, index=index)
expected = DataFrame(
{"a": [1, 2, np.NaN], "b": [np.NaN, np.NaN, 3]}, index=index
)
tm.assert_frame_equal(frame, expected)
def test_frame_from_records_utc(self):
rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)}
# it works
DataFrame.from_records([rec], index="begin_time")
def test_from_records_to_records(self):
# from numpy documentation
arr = np.zeros((2,), dtype=("i4,f4,a10"))
arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
# TODO(wesm): unused
frame = DataFrame.from_records(arr) # noqa
index = Index(np.arange(len(arr))[::-1])
indexed_frame = DataFrame.from_records(arr, index=index)
tm.assert_index_equal(indexed_frame.index, index)
# without names, it should go to last ditch
arr2 = np.zeros((2, 3))
tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
# wrong length
msg = "|".join(
[
r"Length of values \(2\) does not match length of index \(1\)",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(arr, index=index[:-1])
indexed_frame = DataFrame.from_records(arr, index="f1")
# what to do?
records = indexed_frame.to_records()
assert len(records.dtype.names) == 3
records = indexed_frame.to_records(index=False)
assert len(records.dtype.names) == 2
assert "index" not in records.dtype.names
def test_from_records_nones(self):
tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)]
df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"])
assert np.isnan(df["c"][0])
def test_from_records_iterator(self):
arr = np.array(
[(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)],
dtype=[
("x", np.float64),
("u", np.float32),
("y", np.int64),
("z", np.int32),
],
)
df = DataFrame.from_records(iter(arr), nrows=2)
xp = DataFrame(
{
"x": np.array([1.0, 3.0], dtype=np.float64),
"u": np.array([1.0, 3.0], dtype=np.float32),
"y": np.array([2, 4], dtype=np.int64),
"z": np.array([2, 4], dtype=np.int32),
}
)
tm.assert_frame_equal(df.reindex_like(xp), xp)
# no dtypes specified here, so just compare with the default
arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)]
df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2)
tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False)
def test_from_records_tuples_generator(self):
def tuple_generator(length):
for i in range(length):
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
yield (i, letters[i % len(letters)], i / length)
columns_names = ["Integer", "String", "Float"]
columns = [
[i[j] for i in tuple_generator(10)] for j in range(len(columns_names))
]
data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]}
expected = DataFrame(data, columns=columns_names)
generator = tuple_generator(10)
result = DataFrame.from_records(generator, columns=columns_names)
tm.assert_frame_equal(result, expected)
def test_from_records_lists_generator(self):
def list_generator(length):
for i in range(length):
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
yield [i, letters[i % len(letters)], i / length]
columns_names = ["Integer", "String", "Float"]
columns = [
[i[j] for i in list_generator(10)] for j in range(len(columns_names))
]
data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]}
expected = DataFrame(data, columns=columns_names)
generator = list_generator(10)
result = DataFrame.from_records(generator, columns=columns_names)
tm.assert_frame_equal(result, expected)
def test_from_records_columns_not_modified(self):
tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)]
columns = ["a", "b", "c"]
original_columns = list(columns)
df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa
assert columns == original_columns
def test_from_records_decimal(self):
tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)]
df = DataFrame.from_records(tuples, columns=["a"])
assert df["a"].dtype == object
df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True)
assert df["a"].dtype == np.float64
assert np.isnan(df["a"].values[-1])
def test_from_records_duplicates(self):
result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"])
expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"])
tm.assert_frame_equal(result, expected)
def test_from_records_set_index_name(self):
def create_dict(order_id):
return {
"order_id": order_id,
"quantity": np.random.randint(1, 10),
"price": np.random.randint(1, 10),
}
documents = [create_dict(i) for i in range(10)]
# demo missing data
documents.append({"order_id": 10, "quantity": 5})
result = DataFrame.from_records(documents, index="order_id")
assert result.index.name == "order_id"
# MultiIndex
result = DataFrame.from_records(documents, index=["order_id", "quantity"])
assert result.index.names == ("order_id", "quantity")
def test_from_records_misc_brokenness(self):
# GH#2179
data = {1: ["foo"], 2: ["bar"]}
result = DataFrame.from_records(data, columns=["a", "b"])
exp = DataFrame(data, columns=["a", "b"])
tm.assert_frame_equal(result, exp)
# overlap in index/index_names
data = {"a": [1, 2, 3], "b": [4, 5, 6]}
result = DataFrame.from_records(data, index=["a", "b", "c"])
exp = DataFrame(data, index=["a", "b", "c"])
tm.assert_frame_equal(result, exp)
# GH#2623
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
result = df2_obj.dtypes
expected = Series(
[np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"]
)
tm.assert_series_equal(result, expected)
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), 1])
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
result = df2_obj.dtypes
expected = Series(
[np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"]
)
tm.assert_series_equal(result, expected)
def test_from_records_empty(self):
# GH#3562
result = DataFrame.from_records([], columns=["a", "b", "c"])
expected = DataFrame(columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
result = DataFrame.from_records([], columns=["a", "b", "b"])
expected = DataFrame(columns=["a", "b", "b"])
tm.assert_frame_equal(result, expected)
def test_from_records_empty_with_nonempty_fields_gh3682(self):
a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)])
df = DataFrame.from_records(a, index="id")
ex_index = Index([1], name="id")
expected = DataFrame({"value": [2]}, index=ex_index, columns=["value"])
tm.assert_frame_equal(df, expected)
b = a[:0]
df2 = DataFrame.from_records(b, index="id")
tm.assert_frame_equal(df2, df.iloc[:0])
def test_from_records_empty2(self):
# GH#42456
dtype = [("prop", int)]
shape = (0, len(dtype))
arr = np.empty(shape, dtype=dtype)
result = DataFrame.from_records(arr)
expected = DataFrame({"prop": np.array([], dtype=int)})
tm.assert_frame_equal(result, expected)
alt = DataFrame(arr)
tm.assert_frame_equal(alt, expected)

View File

@@ -0,0 +1,60 @@
import re
import numpy as np
import pytest
from pandas import (
DataFrame,
MultiIndex,
)
class TestDataFrameDelItem:
def test_delitem(self, float_frame):
del float_frame["A"]
assert "A" not in float_frame
def test_delitem_multiindex(self):
midx = MultiIndex.from_product([["A", "B"], [1, 2]])
df = DataFrame(np.random.randn(4, 4), columns=midx)
assert len(df.columns) == 4
assert ("A",) in df.columns
assert "A" in df.columns
result = df["A"]
assert isinstance(result, DataFrame)
del df["A"]
assert len(df.columns) == 2
# A still in the levels, BUT get a KeyError if trying
# to delete
assert ("A",) not in df.columns
with pytest.raises(KeyError, match=re.escape("('A',)")):
del df[("A",)]
# behavior of dropped/deleted MultiIndex levels changed from
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
# levels which are dropped/deleted
assert "A" not in df.columns
with pytest.raises(KeyError, match=re.escape("('A',)")):
del df["A"]
def test_delitem_corner(self, float_frame):
f = float_frame.copy()
del f["D"]
assert len(f.columns) == 3
with pytest.raises(KeyError, match=r"^'D'$"):
del f["D"]
del f["B"]
assert len(f.columns) == 2
def test_delitem_col_still_multiindex(self):
arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.randn(3, 4), columns=index)
del df[("a", "", "")]
assert isinstance(df.columns, MultiIndex)

View File

@@ -0,0 +1,27 @@
import pytest
from pandas import DataFrame
import pandas._testing as tm
class TestGet:
def test_get(self, float_frame):
b = float_frame.get("B")
tm.assert_series_equal(b, float_frame["B"])
assert float_frame.get("foo") is None
tm.assert_series_equal(
float_frame.get("foo", float_frame["B"]), float_frame["B"]
)
@pytest.mark.parametrize(
"df",
[
DataFrame(),
DataFrame(columns=list("AB")),
DataFrame(columns=list("AB"), index=range(3)),
],
)
def test_get_none(self, df):
# see gh-5652
assert df.get(None) is None

View File

@@ -0,0 +1,22 @@
import pytest
from pandas import (
DataFrame,
MultiIndex,
)
class TestGetValue:
def test_get_set_value_no_partial_indexing(self):
# partial w/ MultiIndex raise exception
index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)])
df = DataFrame(index=index, columns=range(4))
with pytest.raises(KeyError, match=r"^0$"):
df._get_value(0, 1)
def test_get_value(self, float_frame):
for idx in float_frame.index:
for col in float_frame.columns:
result = float_frame._get_value(idx, col)
expected = float_frame[col][idx]
assert result == expected

View File

@@ -0,0 +1,406 @@
import re
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
concat,
get_dummies,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
class TestGetitem:
def test_getitem_unused_level_raises(self):
# GH#20410
mi = MultiIndex(
levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
codes=[[1, 0], [1, 0]],
)
df = DataFrame(-1, index=range(3), columns=mi)
with pytest.raises(KeyError, match="notevenone"):
df["notevenone"]
def test_getitem_periodindex(self):
rng = period_range("1/1/2000", periods=5)
df = DataFrame(np.random.randn(10, 5), columns=rng)
ts = df[rng[0]]
tm.assert_series_equal(ts, df.iloc[:, 0])
# GH#1211; smoketest unrelated to the rest of this test
repr(df)
ts = df["1/1/2000"]
tm.assert_series_equal(ts, df.iloc[:, 0])
def test_getitem_list_of_labels_categoricalindex_cols(self):
# GH#16115
cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
expected = DataFrame(
[[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
)
dummies = get_dummies(cats)
result = dummies[list(dummies.columns)]
tm.assert_frame_equal(result, expected)
def test_getitem_sparse_column_return_type_and_dtype(self):
# https://github.com/pandas-dev/pandas/issues/23559
data = SparseArray([0, 1])
df = DataFrame({"A": data})
expected = Series(data, name="A")
result = df["A"]
tm.assert_series_equal(result, expected)
# Also check iloc and loc while we're here
result = df.iloc[:, 0]
tm.assert_series_equal(result, expected)
result = df.loc[:, "A"]
tm.assert_series_equal(result, expected)
class TestGetitemListLike:
def test_getitem_list_missing_key(self):
# GH#13822, incorrect error string with non-unique columns when missing
# column is accessed
df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
df.columns = ["x", "x", "z"]
# Check that we get the correct value in the KeyError
with pytest.raises(KeyError, match=r"\['y'\] not in index"):
df[["x", "y", "z"]]
def test_getitem_list_duplicates(self):
# GH#1943
df = DataFrame(np.random.randn(4, 4), columns=list("AABC"))
df.columns.name = "foo"
result = df[["B", "C"]]
assert result.columns.name == "foo"
expected = df.iloc[:, 2:]
tm.assert_frame_equal(result, expected)
def test_getitem_dupe_cols(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
with pytest.raises(KeyError, match=re.escape(msg)):
df[["baf"]]
@pytest.mark.parametrize(
"idx_type",
[
list,
iter,
Index,
set,
lambda l: dict(zip(l, range(len(l)))),
lambda l: dict(zip(l, range(len(l)))).keys(),
],
ids=["list", "iter", "Index", "set", "dict", "dict_keys"],
)
@pytest.mark.parametrize("levels", [1, 2])
def test_getitem_listlike(self, idx_type, levels, float_frame):
# GH#21294
if levels == 1:
frame, missing = float_frame, "food"
else:
# MultiIndex columns
frame = DataFrame(
np.random.randn(8, 3),
columns=Index(
[("foo", "bar"), ("baz", "qux"), ("peek", "aboo")],
name=("sth", "sth2"),
),
)
missing = ("good", "food")
keys = [frame.columns[1], frame.columns[0]]
idx = idx_type(keys)
idx_check = list(idx_type(keys))
if isinstance(idx, (set, dict)):
with tm.assert_produces_warning(FutureWarning):
result = frame[idx]
else:
result = frame[idx]
expected = frame.loc[:, idx_check]
expected.columns.names = frame.columns.names
tm.assert_frame_equal(result, expected)
idx = idx_type(keys + [missing])
with pytest.raises(KeyError, match="not in index"):
with tm.assert_produces_warning(FutureWarning):
frame[idx]
def test_getitem_iloc_generator(self):
# GH#39614
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
indexer = (x for x in [1, 2])
result = df.iloc[indexer]
expected = DataFrame({"a": [2, 3], "b": [5, 6]}, index=[1, 2])
tm.assert_frame_equal(result, expected)
def test_getitem_iloc_two_dimensional_generator(self):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
indexer = (x for x in [1, 2])
result = df.iloc[indexer, 1]
expected = Series([5, 6], name="b", index=[1, 2])
tm.assert_series_equal(result, expected)
class TestGetitemCallable:
def test_getitem_callable(self, float_frame):
# GH#12533
result = float_frame[lambda x: "A"]
expected = float_frame.loc[:, "A"]
tm.assert_series_equal(result, expected)
result = float_frame[lambda x: ["A", "B"]]
expected = float_frame.loc[:, ["A", "B"]]
tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]])
df = float_frame[:3]
result = df[lambda x: [True, False, True]]
expected = float_frame.iloc[[0, 2], :]
tm.assert_frame_equal(result, expected)
def test_loc_multiindex_columns_one_level(self):
# GH#29749
df = DataFrame([[1, 2]], columns=[["a", "b"]])
expected = DataFrame([1], columns=[["a"]])
result = df["a"]
tm.assert_frame_equal(result, expected)
result = df.loc[:, "a"]
tm.assert_frame_equal(result, expected)
class TestGetitemBooleanMask:
def test_getitem_bool_mask_categorical_index(self):
df3 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
},
index=CategoricalIndex(
[1, 1, 2, 1, 3, 2],
dtype=CategoricalDtype([3, 2, 1], ordered=True),
name="B",
),
)
df4 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
},
index=CategoricalIndex(
[1, 1, 2, 1, 3, 2],
dtype=CategoricalDtype([3, 2, 1], ordered=False),
name="B",
),
)
result = df3[df3.index == "a"]
expected = df3.iloc[[]]
tm.assert_frame_equal(result, expected)
result = df4[df4.index == "a"]
expected = df4.iloc[[]]
tm.assert_frame_equal(result, expected)
result = df3[df3.index == 1]
expected = df3.iloc[[0, 1, 3]]
tm.assert_frame_equal(result, expected)
result = df4[df4.index == 1]
expected = df4.iloc[[0, 1, 3]]
tm.assert_frame_equal(result, expected)
# since we have an ordered categorical
# CategoricalIndex([1, 1, 2, 1, 3, 2],
# categories=[3, 2, 1],
# ordered=True,
# name='B')
result = df3[df3.index < 2]
expected = df3.iloc[[4]]
tm.assert_frame_equal(result, expected)
result = df3[df3.index > 1]
expected = df3.iloc[[]]
tm.assert_frame_equal(result, expected)
# unordered
# cannot be compared
# CategoricalIndex([1, 1, 2, 1, 3, 2],
# categories=[3, 2, 1],
# ordered=False,
# name='B')
msg = "Unordered Categoricals can only compare equality or not"
with pytest.raises(TypeError, match=msg):
df4[df4.index < 2]
with pytest.raises(TypeError, match=msg):
df4[df4.index > 1]
@pytest.mark.parametrize(
"data1,data2,expected_data",
(
(
[[1, 2], [3, 4]],
[[0.5, 6], [7, 8]],
[[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]],
),
(
[[1, 2], [3, 4]],
[[5, 6], [7, 8]],
[[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]],
),
),
)
def test_getitem_bool_mask_duplicate_columns_mixed_dtypes(
self,
data1,
data2,
expected_data,
):
# GH#31954
df1 = DataFrame(np.array(data1))
df2 = DataFrame(np.array(data2))
df = concat([df1, df2], axis=1)
result = df[df > 2]
exdict = {i: np.array(col) for i, col in enumerate(expected_data)}
expected = DataFrame(exdict).rename(columns={2: 0, 3: 1})
tm.assert_frame_equal(result, expected)
@pytest.fixture
def df_dup_cols(self):
dups = ["A", "A", "C", "D"]
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
return df
def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols):
# `df.A > 6` is a DataFrame with a different shape from df
# boolean with the duplicate raises
df = df_dup_cols
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df[df.A > 6]
def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols):
# boolean indexing
# GH#4879
df = DataFrame(
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
)
expected = df[df.C > 6]
expected.columns = df_dup_cols.columns
df = df_dup_cols
result = df[df.C > 6]
tm.assert_frame_equal(result, expected)
result.dtypes
str(result)
def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols):
# where
df = DataFrame(
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
)
# `df > 6` is a DataFrame with the same shape+alignment as df
expected = df[df > 6]
expected.columns = df_dup_cols.columns
df = df_dup_cols
result = df[df > 6]
tm.assert_frame_equal(result, expected)
result.dtypes
str(result)
def test_getitem_empty_frame_with_boolean(self):
# Test for issue GH#11859
df = DataFrame()
df2 = df[df > 0]
tm.assert_frame_equal(df, df2)
class TestGetitemSlice:
def test_getitem_slice_float64(self, frame_or_series):
values = np.arange(10.0, 50.0, 2)
index = Index(values)
start, end = values[[5, 15]]
data = np.random.randn(20, 3)
if frame_or_series is not DataFrame:
data = data[:, 0]
obj = frame_or_series(data, index=index)
result = obj[start:end]
expected = obj.iloc[5:16]
tm.assert_equal(result, expected)
result = obj.loc[start:end]
tm.assert_equal(result, expected)
def test_getitem_datetime_slice(self):
# GH#43223
df = DataFrame(
{"a": 0},
index=DatetimeIndex(
[
"11.01.2011 22:00",
"11.01.2011 23:00",
"12.01.2011 00:00",
"2011-01-13 00:00",
]
),
)
with tm.assert_produces_warning(FutureWarning):
result = df["2011-01-01":"2011-11-01"]
expected = DataFrame(
{"a": 0},
index=DatetimeIndex(
["11.01.2011 22:00", "11.01.2011 23:00", "2011-01-13 00:00"]
),
)
tm.assert_frame_equal(result, expected)
class TestGetitemDeprecatedIndexers:
@pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}])
def test_getitem_dict_and_set_deprecated(self, key):
# GH#42825
df = DataFrame(
[[1, 2], [3, 4]], columns=MultiIndex.from_tuples([("a", 1), ("b", 2)])
)
with tm.assert_produces_warning(FutureWarning):
df[key]

View File

@@ -0,0 +1,106 @@
"""
test_insert is specifically for the DataFrame.insert method; not to be
confused with tests with "insert" in their names that are really testing
__setitem__.
"""
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
class TestDataFrameInsert:
def test_insert(self):
df = DataFrame(
np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
)
df.insert(0, "foo", df["a"])
tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"]))
tm.assert_series_equal(df["a"], df["foo"], check_names=False)
df.insert(2, "bar", df["c"])
tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"]))
tm.assert_almost_equal(df["c"], df["bar"], check_names=False)
with pytest.raises(ValueError, match="already exists"):
df.insert(1, "a", df["b"])
msg = "cannot insert c, already exists"
with pytest.raises(ValueError, match=msg):
df.insert(1, "c", df["b"])
df.columns.name = "some_name"
# preserve columns name field
df.insert(0, "baz", df["c"])
assert df.columns.name == "some_name"
def test_insert_column_bug_4032(self):
# GH#4032, inserting a column and renaming causing errors
df = DataFrame({"b": [1.1, 2.2]})
df = df.rename(columns={})
df.insert(0, "a", [1, 2])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
df.insert(0, "c", [1.3, 2.3])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"])
tm.assert_frame_equal(result, expected)
def test_insert_with_columns_dups(self):
# GH#14291
df = DataFrame()
df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True)
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)
def test_insert_item_cache(self, using_array_manager):
df = DataFrame(np.random.randn(4, 3))
ser = df[0]
if using_array_manager:
expected_warning = None
else:
# with BlockManager warn about high fragmentation of single dtype
expected_warning = PerformanceWarning
with tm.assert_produces_warning(expected_warning):
for n in range(100):
df[n + 3] = df[1] * n
ser.values[0] = 99
assert df.iloc[0, 0] == df[0][0]
def test_insert_EA_no_warning(self):
# PerformanceWarning about fragmented frame should not be raised when
# using EAs (https://github.com/pandas-dev/pandas/issues/44098)
df = DataFrame(np.random.randint(0, 100, size=(3, 100)), dtype="Int64")
with tm.assert_produces_warning(None):
df["a"] = np.array([1, 2, 3])
def test_insert_frame(self):
# GH#42403
df = DataFrame({"col1": [1, 2], "col2": [3, 4]})
msg = r"Expected a 1D array, got an array with shape \(2, 2\)"
with pytest.raises(ValueError, match=msg):
df.insert(1, "newcol", df)

View File

@@ -0,0 +1,94 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
class TestLookup:
def test_lookup_float(self, float_frame):
df = float_frame
rows = list(df.index) * len(df.columns)
cols = list(df.columns) * len(df.index)
with tm.assert_produces_warning(FutureWarning):
result = df.lookup(rows, cols)
expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)])
tm.assert_numpy_array_equal(result, expected)
def test_lookup_mixed(self, float_string_frame):
df = float_string_frame
rows = list(df.index) * len(df.columns)
cols = list(df.columns) * len(df.index)
with tm.assert_produces_warning(FutureWarning):
result = df.lookup(rows, cols)
expected = np.array(
[df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_
)
tm.assert_almost_equal(result, expected)
def test_lookup_bool(self):
df = DataFrame(
{
"label": ["a", "b", "a", "c"],
"mask_a": [True, True, False, True],
"mask_b": [True, False, False, False],
"mask_c": [False, True, False, True],
}
)
with tm.assert_produces_warning(FutureWarning):
df["mask"] = df.lookup(df.index, "mask_" + df["label"])
exp_mask = np.array(
[df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])]
)
tm.assert_series_equal(df["mask"], Series(exp_mask, name="mask"))
assert df["mask"].dtype == np.bool_
def test_lookup_raises(self, float_frame):
with pytest.raises(KeyError, match="'One or more row labels was not found'"):
with tm.assert_produces_warning(FutureWarning):
float_frame.lookup(["xyz"], ["A"])
with pytest.raises(KeyError, match="'One or more column labels was not found'"):
with tm.assert_produces_warning(FutureWarning):
float_frame.lookup([float_frame.index[0]], ["xyz"])
with pytest.raises(ValueError, match="same size"):
with tm.assert_produces_warning(FutureWarning):
float_frame.lookup(["a", "b", "c"], ["a"])
def test_lookup_requires_unique_axes(self):
# GH#33041 raise with a helpful error message
df = DataFrame(np.random.randn(6).reshape(3, 2), columns=["A", "A"])
rows = [0, 1]
cols = ["A", "A"]
# homogeneous-dtype case
with pytest.raises(ValueError, match="requires unique index and columns"):
with tm.assert_produces_warning(FutureWarning):
df.lookup(rows, cols)
with pytest.raises(ValueError, match="requires unique index and columns"):
with tm.assert_produces_warning(FutureWarning):
df.T.lookup(cols, rows)
# heterogeneous dtype
df["B"] = 0
with pytest.raises(ValueError, match="requires unique index and columns"):
with tm.assert_produces_warning(FutureWarning):
df.lookup(rows, cols)
def test_lookup_deprecated():
# GH#18262
df = DataFrame(
{"col": ["A", "A", "B", "B"], "A": [80, 23, np.nan, 22], "B": [80, 55, 76, 67]}
)
with tm.assert_produces_warning(FutureWarning):
df.lookup(df.index, df["col"])

View File

@@ -0,0 +1,162 @@
"""
Tests for DataFrame.mask; tests DataFrame.where as a side-effect.
"""
import numpy as np
from pandas import (
NA,
DataFrame,
Series,
StringDtype,
Timedelta,
isna,
)
import pandas._testing as tm
class TestDataFrameMask:
def test_mask(self):
df = DataFrame(np.random.randn(5, 3))
cond = df > 0
rs = df.where(cond, np.nan)
tm.assert_frame_equal(rs, df.mask(df <= 0))
tm.assert_frame_equal(rs, df.mask(~cond))
other = DataFrame(np.random.randn(5, 3))
rs = df.where(cond, other)
tm.assert_frame_equal(rs, df.mask(df <= 0, other))
tm.assert_frame_equal(rs, df.mask(~cond, other))
def test_mask2(self):
# see GH#21891
df = DataFrame([1, 2])
res = df.mask([[True], [False]])
exp = DataFrame([np.nan, 2])
tm.assert_frame_equal(res, exp)
def test_mask_inplace(self):
# GH#8801
df = DataFrame(np.random.randn(5, 3))
cond = df > 0
rdf = df.copy()
return_value = rdf.where(cond, inplace=True)
assert return_value is None
tm.assert_frame_equal(rdf, df.where(cond))
tm.assert_frame_equal(rdf, df.mask(~cond))
rdf = df.copy()
return_value = rdf.where(cond, -df, inplace=True)
assert return_value is None
tm.assert_frame_equal(rdf, df.where(cond, -df))
tm.assert_frame_equal(rdf, df.mask(~cond, -df))
def test_mask_edge_case_1xN_frame(self):
# GH#4071
df = DataFrame([[1, 2]])
res = df.mask(DataFrame([[True, False]]))
expec = DataFrame([[np.nan, 2]])
tm.assert_frame_equal(res, expec)
def test_mask_callable(self):
# GH#12533
df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
result = df.mask(lambda x: x > 4, lambda x: x + 1)
exp = DataFrame([[1, 2, 3], [4, 6, 7], [8, 9, 10]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, df.mask(df > 4, df + 1))
# return ndarray and scalar
result = df.mask(lambda x: (x % 2 == 0).values, lambda x: 99)
exp = DataFrame([[1, 99, 3], [99, 5, 99], [7, 99, 9]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, df.mask(df % 2 == 0, 99))
# chain
result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10)
exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10))
def test_mask_dtype_bool_conversion(self):
# GH#3733
df = DataFrame(data=np.random.randn(100, 50))
df = df.where(df > 0) # create nans
bools = df > 0
mask = isna(df)
expected = bools.astype(object).mask(mask)
result = bools.mask(mask)
tm.assert_frame_equal(result, expected)
def test_mask_pos_args_deprecation(self, frame_or_series):
# https://github.com/pandas-dev/pandas/issues/41485
obj = DataFrame({"a": range(5)})
expected = DataFrame({"a": [-1, 1, -1, 3, -1]})
obj = tm.get_obj(obj, frame_or_series)
expected = tm.get_obj(expected, frame_or_series)
cond = obj % 2 == 0
msg = (
r"In a future version of pandas all arguments of "
f"{frame_or_series.__name__}.mask except for "
r"the arguments 'cond' and 'other' will be keyword-only"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = obj.mask(cond, -1, False)
tm.assert_equal(result, expected)
def test_mask_try_cast_deprecated(frame_or_series):
obj = DataFrame(np.random.randn(4, 3))
if frame_or_series is not DataFrame:
obj = obj[0]
mask = obj > 0
with tm.assert_produces_warning(FutureWarning):
# try_cast keyword deprecated
obj.mask(mask, -1, try_cast=True)
def test_mask_stringdtype(frame_or_series):
# GH 40824
obj = DataFrame(
{"A": ["foo", "bar", "baz", NA]},
index=["id1", "id2", "id3", "id4"],
dtype=StringDtype(),
)
filtered_obj = DataFrame(
{"A": ["this", "that"]}, index=["id2", "id3"], dtype=StringDtype()
)
expected = DataFrame(
{"A": [NA, "this", "that", NA]},
index=["id1", "id2", "id3", "id4"],
dtype=StringDtype(),
)
if frame_or_series is Series:
obj = obj["A"]
filtered_obj = filtered_obj["A"]
expected = expected["A"]
filter_ser = Series([False, True, True, False])
result = obj.mask(filter_ser, filtered_obj)
tm.assert_equal(result, expected)
def test_mask_where_dtype_timedelta():
# https://github.com/pandas-dev/pandas/issues/39548
df = DataFrame([Timedelta(i, unit="d") for i in range(5)])
expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]"))
tm.assert_frame_equal(df.mask(df.notna()), expected)
expected = DataFrame(
[np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")]
)
tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected)

View File

@@ -0,0 +1,68 @@
import numpy as np
from pandas.core.dtypes.common import is_float_dtype
from pandas import (
DataFrame,
isna,
)
class TestSetValue:
def test_set_value(self, float_frame):
for idx in float_frame.index:
for col in float_frame.columns:
float_frame._set_value(idx, col, 1)
assert float_frame[col][idx] == 1
def test_set_value_resize(self, float_frame):
res = float_frame._set_value("foobar", "B", 0)
assert res is None
assert float_frame.index[-1] == "foobar"
assert float_frame._get_value("foobar", "B") == 0
float_frame.loc["foobar", "qux"] = 0
assert float_frame._get_value("foobar", "qux") == 0
res = float_frame.copy()
res._set_value("foobar", "baz", "sam")
assert res["baz"].dtype == np.object_
res = float_frame.copy()
res._set_value("foobar", "baz", True)
assert res["baz"].dtype == np.object_
res = float_frame.copy()
res._set_value("foobar", "baz", 5)
assert is_float_dtype(res["baz"])
assert isna(res["baz"].drop(["foobar"])).all()
res._set_value("foobar", "baz", "sam")
assert res.loc["foobar", "baz"] == "sam"
def test_set_value_with_index_dtype_change(self):
df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC"))
# this is actually ambiguous as the 2 is interpreted as a positional
# so column is not created
df = df_orig.copy()
df._set_value("C", 2, 1.0)
assert list(df.index) == list(df_orig.index) + ["C"]
# assert list(df.columns) == list(df_orig.columns) + [2]
df = df_orig.copy()
df.loc["C", 2] = 1.0
assert list(df.index) == list(df_orig.index) + ["C"]
# assert list(df.columns) == list(df_orig.columns) + [2]
# create both new
df = df_orig.copy()
df._set_value("C", "D", 1.0)
assert list(df.index) == list(df_orig.index) + ["C"]
assert list(df.columns) == list(df_orig.columns) + ["D"]
df = df_orig.copy()
df.loc["C", "D"] = 1.0
assert list(df.index) == list(df_orig.index) + ["C"]
assert list(df.columns) == list(df_orig.columns) + ["D"]

View File

@@ -0,0 +1,88 @@
import pytest
import pandas._testing as tm
class TestDataFrameTake:
def test_take(self, float_frame):
# homogeneous
order = [3, 1, 2, 0]
for df in [float_frame]:
result = df.take(order, axis=0)
expected = df.reindex(df.index.take(order))
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.take(order, axis=1)
expected = df.loc[:, ["D", "B", "C", "A"]]
tm.assert_frame_equal(result, expected, check_names=False)
# negative indices
order = [2, 1, -1]
for df in [float_frame]:
result = df.take(order, axis=0)
expected = df.reindex(df.index.take(order))
tm.assert_frame_equal(result, expected)
result = df.take(order, axis=0)
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.take(order, axis=1)
expected = df.loc[:, ["C", "B", "D"]]
tm.assert_frame_equal(result, expected, check_names=False)
# illegal indices
msg = "indices are out-of-bounds"
with pytest.raises(IndexError, match=msg):
df.take([3, 1, 2, 30], axis=0)
with pytest.raises(IndexError, match=msg):
df.take([3, 1, 2, -31], axis=0)
with pytest.raises(IndexError, match=msg):
df.take([3, 1, 2, 5], axis=1)
with pytest.raises(IndexError, match=msg):
df.take([3, 1, 2, -5], axis=1)
def test_take_mixed_type(self, float_string_frame):
# mixed-dtype
order = [4, 1, 2, 0, 3]
for df in [float_string_frame]:
result = df.take(order, axis=0)
expected = df.reindex(df.index.take(order))
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.take(order, axis=1)
expected = df.loc[:, ["foo", "B", "C", "A", "D"]]
tm.assert_frame_equal(result, expected)
# negative indices
order = [4, 1, -2]
for df in [float_string_frame]:
result = df.take(order, axis=0)
expected = df.reindex(df.index.take(order))
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.take(order, axis=1)
expected = df.loc[:, ["foo", "B", "D"]]
tm.assert_frame_equal(result, expected)
def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame):
# by dtype
order = [1, 2, 0, 3]
for df in [mixed_float_frame, mixed_int_frame]:
result = df.take(order, axis=0)
expected = df.reindex(df.index.take(order))
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.take(order, axis=1)
expected = df.loc[:, ["B", "C", "A", "D"]]
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,929 @@
from datetime import datetime
from hypothesis import given
import numpy as np
import pytest
from pandas.compat import np_version_under1p19
from pandas.core.dtypes.common import is_scalar
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Series,
StringDtype,
Timestamp,
date_range,
isna,
)
import pandas._testing as tm
from pandas._testing._hypothesis import OPTIONAL_ONE_OF_ALL
@pytest.fixture(params=["default", "float_string", "mixed_float", "mixed_int"])
def where_frame(request, float_string_frame, mixed_float_frame, mixed_int_frame):
if request.param == "default":
return DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
if request.param == "float_string":
return float_string_frame
if request.param == "mixed_float":
return mixed_float_frame
if request.param == "mixed_int":
return mixed_int_frame
def _safe_add(df):
# only add to the numeric items
def is_ok(s):
return (
issubclass(s.dtype.type, (np.integer, np.floating)) and s.dtype != "uint8"
)
return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()))
class TestDataFrameIndexingWhere:
def test_where_get(self, where_frame, float_string_frame):
def _check_get(df, cond, check_dtypes=True):
other1 = _safe_add(df)
rs = df.where(cond, other1)
rs2 = df.where(cond.values, other1)
for k, v in rs.items():
exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index)
tm.assert_series_equal(v, exp, check_names=False)
tm.assert_frame_equal(rs, rs2)
# dtypes
if check_dtypes:
assert (rs.dtypes == df.dtypes).all()
# check getting
df = where_frame
if df is float_string_frame:
msg = "'>' not supported between instances of 'str' and 'int'"
with pytest.raises(TypeError, match=msg):
df > 0
return
cond = df > 0
_check_get(df, cond)
def test_where_upcasting(self):
# upcasting case (GH # 2794)
df = DataFrame(
{
c: Series([1] * 3, dtype=c)
for c in ["float32", "float64", "int32", "int64"]
}
)
df.iloc[1, :] = 0
result = df.dtypes
expected = Series(
[
np.dtype("float32"),
np.dtype("float64"),
np.dtype("int32"),
np.dtype("int64"),
],
index=["float32", "float64", "int32", "int64"],
)
# when we don't preserve boolean casts
#
# expected = Series({ 'float32' : 1, 'float64' : 3 })
tm.assert_series_equal(result, expected)
def test_where_alignment(self, where_frame, float_string_frame, mixed_int_frame):
# aligning
def _check_align(df, cond, other, check_dtypes=True):
rs = df.where(cond, other)
for i, k in enumerate(rs.columns):
result = rs[k]
d = df[k].values
c = cond[k].reindex(df[k].index).fillna(False).values
if is_scalar(other):
o = other
else:
if isinstance(other, np.ndarray):
o = Series(other[:, i], index=result.index).values
else:
o = other[k].values
new_values = d if c.all() else np.where(c, d, o)
expected = Series(new_values, index=result.index, name=k)
# since we can't always have the correct numpy dtype
# as numpy doesn't know how to downcast, don't check
tm.assert_series_equal(result, expected, check_dtype=False)
# dtypes
# can't check dtype when other is an ndarray
if check_dtypes and not isinstance(other, np.ndarray):
assert (rs.dtypes == df.dtypes).all()
df = where_frame
if df is float_string_frame:
msg = "'>' not supported between instances of 'str' and 'int'"
with pytest.raises(TypeError, match=msg):
df > 0
return
# other is a frame
cond = (df > 0)[1:]
_check_align(df, cond, _safe_add(df))
# check other is ndarray
cond = df > 0
warn = None
if df is mixed_int_frame:
warn = FutureWarning
with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"):
_check_align(df, cond, (_safe_add(df).values))
# integers are upcast, so don't check the dtypes
cond = df > 0
check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes)
_check_align(df, cond, np.nan, check_dtypes=check_dtypes)
def test_where_invalid(self):
# invalid conditions
df = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
cond = df > 0
err1 = (df + 1).values[0:2, :]
msg = "other must be the same shape as self when an ndarray"
with pytest.raises(ValueError, match=msg):
df.where(cond, err1)
err2 = cond.iloc[:2, :].values
other1 = _safe_add(df)
msg = "Array conditional must be same shape as self"
with pytest.raises(ValueError, match=msg):
df.where(err2, other1)
with pytest.raises(ValueError, match=msg):
df.mask(True)
with pytest.raises(ValueError, match=msg):
df.mask(0)
def test_where_set(self, where_frame, float_string_frame):
# where inplace
def _check_set(df, cond, check_dtypes=True):
dfi = df.copy()
econd = cond.reindex_like(df).fillna(True)
expected = dfi.mask(~econd)
return_value = dfi.where(cond, np.nan, inplace=True)
assert return_value is None
tm.assert_frame_equal(dfi, expected)
# dtypes (and confirm upcasts)x
if check_dtypes:
for k, v in df.dtypes.items():
if issubclass(v.type, np.integer) and not cond[k].all():
v = np.dtype("float64")
assert dfi[k].dtype == v
df = where_frame
if df is float_string_frame:
msg = "'>' not supported between instances of 'str' and 'int'"
with pytest.raises(TypeError, match=msg):
df > 0
return
cond = df > 0
_check_set(df, cond)
cond = df >= 0
_check_set(df, cond)
# aligning
cond = (df >= 0)[1:]
_check_set(df, cond)
def test_where_series_slicing(self):
# GH 10218
# test DataFrame.where with Series slicing
df = DataFrame({"a": range(3), "b": range(4, 7)})
result = df.where(df["a"] == 1)
expected = df[df["a"] == 1].reindex(df.index)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("klass", [list, tuple, np.array])
def test_where_array_like(self, klass):
# see gh-15414
df = DataFrame({"a": [1, 2, 3]})
cond = [[False], [True], [True]]
expected = DataFrame({"a": [np.nan, 2, 3]})
result = df.where(klass(cond))
tm.assert_frame_equal(result, expected)
df["b"] = 2
expected["b"] = [2, np.nan, 2]
cond = [[False, True], [True, False], [True, True]]
result = df.where(klass(cond))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"cond",
[
[[1], [0], [1]],
Series([[2], [5], [7]]),
DataFrame({"a": [2, 5, 7]}),
[["True"], ["False"], ["True"]],
[[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]],
],
)
def test_where_invalid_input_single(self, cond):
# see gh-15414: only boolean arrays accepted
df = DataFrame({"a": [1, 2, 3]})
msg = "Boolean array expected for the condition"
with pytest.raises(ValueError, match=msg):
df.where(cond)
@pytest.mark.parametrize(
"cond",
[
[[0, 1], [1, 0], [1, 1]],
Series([[0, 2], [5, 0], [4, 7]]),
[["False", "True"], ["True", "False"], ["True", "True"]],
DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}),
[
[pd.NaT, Timestamp("2017-01-01")],
[Timestamp("2017-01-02"), pd.NaT],
[Timestamp("2017-01-03"), Timestamp("2017-01-03")],
],
],
)
def test_where_invalid_input_multiple(self, cond):
# see gh-15414: only boolean arrays accepted
df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]})
msg = "Boolean array expected for the condition"
with pytest.raises(ValueError, match=msg):
df.where(cond)
def test_where_dataframe_col_match(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]])
cond = DataFrame([[True, False, True], [False, False, True]])
result = df.where(cond)
expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]])
tm.assert_frame_equal(result, expected)
# this *does* align, though has no matching columns
cond.columns = ["a", "b", "c"]
result = df.where(cond)
expected = DataFrame(np.nan, index=df.index, columns=df.columns)
tm.assert_frame_equal(result, expected)
def test_where_ndframe_align(self):
msg = "Array conditional must be same shape as self"
df = DataFrame([[1, 2, 3], [4, 5, 6]])
cond = [True]
with pytest.raises(ValueError, match=msg):
df.where(cond)
expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]])
out = df.where(Series(cond))
tm.assert_frame_equal(out, expected)
cond = np.array([False, True, False, True])
with pytest.raises(ValueError, match=msg):
df.where(cond)
expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]])
out = df.where(Series(cond))
tm.assert_frame_equal(out, expected)
def test_where_bug(self):
# see gh-2793
df = DataFrame(
{"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64"
)
expected = DataFrame(
{"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]},
dtype="float64",
)
result = df.where(df > 2, np.nan)
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(result > 2, np.nan, inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_bug_mixed(self, any_signed_int_numpy_dtype):
# see gh-2793
df = DataFrame(
{
"a": np.array([1, 2, 3, 4], dtype=any_signed_int_numpy_dtype),
"b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"),
}
)
expected = DataFrame(
{"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]},
dtype="float64",
)
result = df.where(df > 2, np.nan)
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(result > 2, np.nan, inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_bug_transposition(self):
# see gh-7506
a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]})
b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]})
do_not_replace = b.isna() | (a > b)
expected = a.copy()
expected[~do_not_replace] = b
result = a.where(do_not_replace, b)
tm.assert_frame_equal(result, expected)
a = DataFrame({0: [4, 6], 1: [1, 0]})
b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]})
do_not_replace = b.isna() | (a > b)
expected = a.copy()
expected[~do_not_replace] = b
result = a.where(do_not_replace, b)
tm.assert_frame_equal(result, expected)
def test_where_datetime(self):
# GH 3311
df = DataFrame(
{
"A": date_range("20130102", periods=5),
"B": date_range("20130104", periods=5),
"C": np.random.randn(5),
}
)
stamp = datetime(2013, 1, 3)
msg = "'>' not supported between instances of 'float' and 'datetime.datetime'"
with pytest.raises(TypeError, match=msg):
df > stamp
result = df[df.iloc[:, :-1] > stamp]
expected = df.copy()
expected.loc[[0, 1], "A"] = np.nan
expected.loc[:, "C"] = np.nan
tm.assert_frame_equal(result, expected)
def test_where_none(self):
# GH 4667
# setting with None changes dtype
df = DataFrame({"series": Series(range(10))}).astype(float)
df[df > 7] = None
expected = DataFrame(
{"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])}
)
tm.assert_frame_equal(df, expected)
# GH 7656
df = DataFrame(
[
{"A": 1, "B": np.nan, "C": "Test"},
{"A": np.nan, "B": "Test", "C": np.nan},
]
)
msg = "boolean setting on mixed-type"
with pytest.raises(TypeError, match=msg):
df.where(~isna(df), None, inplace=True)
def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self):
# see gh-21947
df = DataFrame(columns=["a"])
cond = df
assert (cond.dtypes == object).all()
result = df.where(cond)
tm.assert_frame_equal(result, df)
def test_where_align(self):
def create():
df = DataFrame(np.random.randn(10, 3))
df.iloc[3:5, 0] = np.nan
df.iloc[4:6, 1] = np.nan
df.iloc[5:8, 2] = np.nan
return df
# series
df = create()
expected = df.fillna(df.mean())
result = df.where(pd.notna(df), df.mean(), axis="columns")
tm.assert_frame_equal(result, expected)
return_value = df.where(pd.notna(df), df.mean(), inplace=True, axis="columns")
assert return_value is None
tm.assert_frame_equal(df, expected)
df = create().fillna(0)
expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0])
result = df.where(df > 0, df[0], axis="index")
tm.assert_frame_equal(result, expected)
result = df.where(df > 0, df[0], axis="rows")
tm.assert_frame_equal(result, expected)
# frame
df = create()
expected = df.fillna(1)
result = df.where(
pd.notna(df), DataFrame(1, index=df.index, columns=df.columns)
)
tm.assert_frame_equal(result, expected)
def test_where_complex(self):
# GH 6345
expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"])
df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"])
df[df.abs() >= 5] = np.nan
tm.assert_frame_equal(df, expected)
def test_where_axis(self, using_array_manager):
# GH 9736
df = DataFrame(np.random.randn(2, 2))
mask = DataFrame([[False, False], [False, False]])
s = Series([0, 1])
expected = DataFrame([[0, 0], [1, 1]], dtype="float64")
result = df.where(mask, s, axis="index")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, s, axis="index", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
expected = DataFrame([[0, 1], [0, 1]], dtype="float64")
result = df.where(mask, s, axis="columns")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, s, axis="columns", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
# Upcast needed
df = DataFrame([[1, 2], [3, 4]], dtype="int64")
mask = DataFrame([[False, False], [False, False]])
s = Series([0, np.nan])
expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64")
result = df.where(mask, s, axis="index")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, s, axis="index", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
warn = FutureWarning if using_array_manager else None
expected = DataFrame([[0, np.nan], [0, np.nan]])
with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"):
result = df.where(mask, s, axis="columns")
tm.assert_frame_equal(result, expected)
expected = DataFrame(
{
0: np.array([0, 0], dtype="int64"),
1: np.array([np.nan, np.nan], dtype="float64"),
}
)
result = df.copy()
return_value = result.where(mask, s, axis="columns", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_axis_multiple_dtypes(self):
# Multiple dtypes (=> multiple Blocks)
df = pd.concat(
[
DataFrame(np.random.randn(10, 2)),
DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"),
],
ignore_index=True,
axis=1,
)
mask = DataFrame(False, columns=df.columns, index=df.index)
s1 = Series(1, index=df.columns)
s2 = Series(2, index=df.index)
result = df.where(mask, s1, axis="columns")
expected = DataFrame(1.0, columns=df.columns, index=df.index)
expected[2] = expected[2].astype("int64")
expected[3] = expected[3].astype("int64")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, s1, axis="columns", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.where(mask, s2, axis="index")
expected = DataFrame(2.0, columns=df.columns, index=df.index)
expected[2] = expected[2].astype("int64")
expected[3] = expected[3].astype("int64")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, s2, axis="index", inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
# DataFrame vs DataFrame
d1 = df.copy().drop(1, axis=0)
expected = df.copy()
expected.loc[1, :] = np.nan
result = df.where(mask, d1)
tm.assert_frame_equal(result, expected)
result = df.where(mask, d1, axis="index")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, d1, inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, d1, inplace=True, axis="index")
assert return_value is None
tm.assert_frame_equal(result, expected)
d2 = df.copy().drop(1, axis=1)
expected = df.copy()
expected.loc[:, 1] = np.nan
result = df.where(mask, d2)
tm.assert_frame_equal(result, expected)
result = df.where(mask, d2, axis="columns")
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, d2, inplace=True)
assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
return_value = result.where(mask, d2, inplace=True, axis="columns")
assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_callable(self):
# GH 12533
df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
result = df.where(lambda x: x > 4, lambda x: x + 1)
exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, df.where(df > 4, df + 1))
# return ndarray and scalar
result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99)
exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, df.where(df % 2 == 0, 99))
# chain
result = (df + 2).where(lambda x: x > 8, lambda x: x + 10)
exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]])
tm.assert_frame_equal(result, exp)
tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10))
def test_where_tz_values(self, tz_naive_fixture, frame_or_series):
obj1 = DataFrame(
DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture),
columns=["date"],
)
obj2 = DataFrame(
DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture),
columns=["date"],
)
mask = DataFrame([True, True, False], columns=["date"])
exp = DataFrame(
DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture),
columns=["date"],
)
if frame_or_series is Series:
obj1 = obj1["date"]
obj2 = obj2["date"]
mask = mask["date"]
exp = exp["date"]
result = obj1.where(mask, obj2)
tm.assert_equal(exp, result)
def test_df_where_change_dtype(self):
# GH#16979
df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC"))
mask = np.array([[True, False, False], [False, False, True]])
result = df.where(mask)
expected = DataFrame(
[[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC")
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{}, {"other": None}])
def test_df_where_with_category(self, kwargs):
# GH#16979
df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC"))
mask = np.array([[True, False, False], [False, False, True]])
# change type to category
df.A = df.A.astype("category")
df.B = df.B.astype("category")
df.C = df.C.astype("category")
result = df.where(mask, **kwargs)
A = pd.Categorical([0, np.nan], categories=[0, 3])
B = pd.Categorical([np.nan, np.nan], categories=[1, 4])
C = pd.Categorical([np.nan, 5], categories=[2, 5])
expected = DataFrame({"A": A, "B": B, "C": C})
tm.assert_frame_equal(result, expected)
# Check Series.where while we're here
result = df.A.where(mask[:, 0], **kwargs)
expected = Series(A, name="A")
tm.assert_series_equal(result, expected)
def test_where_categorical_filtering(self):
# GH#22609 Verify filtering operations on DataFrames with categorical Series
df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
df["b"] = df["b"].astype("category")
result = df.where(df["a"] > 0)
expected = df.copy()
expected.loc[0, :] = np.nan
tm.assert_equal(result, expected)
def test_where_ea_other(self):
# GH#38729/GH#38742
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
arr = pd.array([7, pd.NA, 9])
ser = Series(arr)
mask = np.ones(df.shape, dtype=bool)
mask[1, :] = False
# TODO: ideally we would get Int64 instead of object
result = df.where(mask, ser, axis=0)
expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
tm.assert_frame_equal(result, expected)
ser2 = Series(arr[:2], index=["A", "B"])
expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
expected["B"] = expected["B"].astype(object)
result = df.where(mask, ser2, axis=1)
tm.assert_frame_equal(result, expected)
def test_where_interval_noop(self):
# GH#44181
df = DataFrame([pd.Interval(0, 0)])
res = df.where(df.notna())
tm.assert_frame_equal(res, df)
ser = df[0]
res = ser.where(ser.notna())
tm.assert_series_equal(res, ser)
@pytest.mark.parametrize(
"dtype",
[
"timedelta64[ns]",
"datetime64[ns]",
"datetime64[ns, Asia/Tokyo]",
"Period[D]",
],
)
def test_where_datetimelike_noop(self, dtype):
# GH#45135, analogue to GH#44181 for Period don't raise on no-op
# For td64/dt64/dt64tz we already don't raise, but also are
# checking that we don't unnecessarily upcast to object.
ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype)
df = ser.to_frame()
mask = np.array([False, False, False])
res = ser.where(~mask, "foo")
tm.assert_series_equal(res, ser)
mask2 = mask.reshape(-1, 1)
res2 = df.where(~mask2, "foo")
tm.assert_frame_equal(res2, df)
res3 = ser.mask(mask, "foo")
tm.assert_series_equal(res3, ser)
res4 = df.mask(mask2, "foo")
tm.assert_frame_equal(res4, df)
def test_where_try_cast_deprecated(frame_or_series):
obj = DataFrame(np.random.randn(4, 3))
obj = tm.get_obj(obj, frame_or_series)
mask = obj > 0
with tm.assert_produces_warning(FutureWarning):
# try_cast keyword deprecated
obj.where(mask, -1, try_cast=False)
def test_where_int_downcasting_deprecated(using_array_manager):
# GH#44597
arr = np.arange(6).astype(np.int16).reshape(3, 2)
df = DataFrame(arr)
mask = np.zeros(arr.shape, dtype=bool)
mask[:, 0] = True
msg = "Downcasting integer-dtype"
warn = FutureWarning if not using_array_manager else None
with tm.assert_produces_warning(warn, match=msg):
res = df.where(mask, 2**17)
expected = DataFrame({0: arr[:, 0], 1: np.array([2**17] * 3, dtype=np.int32)})
tm.assert_frame_equal(res, expected)
def test_where_copies_with_noop(frame_or_series):
# GH-39595
result = frame_or_series([1, 2, 3, 4])
expected = result.copy()
col = result[0] if frame_or_series is DataFrame else result
where_res = result.where(col < 5)
where_res *= 2
tm.assert_equal(result, expected)
where_res = result.where(col > 5, [1, 2, 3, 4])
where_res *= 2
tm.assert_equal(result, expected)
def test_where_string_dtype(frame_or_series):
# GH40824
obj = frame_or_series(
["a", "b", "c", "d"], index=["id1", "id2", "id3", "id4"], dtype=StringDtype()
)
filtered_obj = frame_or_series(
["b", "c"], index=["id2", "id3"], dtype=StringDtype()
)
filter_ser = Series([False, True, True, False])
result = obj.where(filter_ser, filtered_obj)
expected = frame_or_series(
[pd.NA, "b", "c", pd.NA],
index=["id1", "id2", "id3", "id4"],
dtype=StringDtype(),
)
tm.assert_equal(result, expected)
def test_where_bool_comparison():
# GH 10336
df_mask = DataFrame(
{"AAA": [True] * 4, "BBB": [False] * 4, "CCC": [True, False, True, False]}
)
result = df_mask.where(df_mask == False) # noqa:E712
expected = DataFrame(
{
"AAA": np.array([np.nan] * 4, dtype=object),
"BBB": [False] * 4,
"CCC": [np.nan, False, np.nan, False],
}
)
tm.assert_frame_equal(result, expected)
def test_where_none_nan_coerce():
# GH 15613
expected = DataFrame(
{
"A": [Timestamp("20130101"), pd.NaT, Timestamp("20130103")],
"B": [1, 2, np.nan],
}
)
result = expected.where(expected.notnull(), None)
tm.assert_frame_equal(result, expected)
def test_where_non_keyword_deprecation(frame_or_series):
# GH 41485
obj = frame_or_series(range(5))
msg = (
"In a future version of pandas all arguments of "
f"{frame_or_series.__name__}.where except for the arguments 'cond' "
"and 'other' will be keyword-only"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = obj.where(obj > 1, 10, False)
expected = frame_or_series([10, 10, 2, 3, 4])
tm.assert_equal(expected, result)
def test_where_columns_casting():
# GH 42295
df = DataFrame({"a": [1.0, 2.0], "b": [3, np.nan]})
expected = df.copy()
result = df.where(pd.notnull(df), None)
# make sure dtypes don't change
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("as_cat", [True, False])
def test_where_period_invalid_na(frame_or_series, as_cat, request):
# GH#44697
idx = pd.period_range("2016-01-01", periods=3, freq="D")
if as_cat:
idx = idx.astype("category")
obj = frame_or_series(idx)
# NA value that we should *not* cast to Period dtype
tdnat = pd.NaT.to_numpy("m8[ns]")
mask = np.array([True, True, False], ndmin=obj.ndim).T
if as_cat:
msg = (
r"Cannot setitem on a Categorical with a new category \(NaT\), "
"set the categories first"
)
if np_version_under1p19:
mark = pytest.mark.xfail(
reason="When evaluating the f-string to generate the exception "
"message, numpy somehow ends up trying to cast None to int, so "
"ends up raising TypeError but with an unrelated message."
)
request.node.add_marker(mark)
else:
msg = "value should be a 'Period'"
with pytest.raises(TypeError, match=msg):
obj.where(mask, tdnat)
with pytest.raises(TypeError, match=msg):
obj.mask(mask, tdnat)
with pytest.raises(TypeError, match=msg):
obj.mask(mask, tdnat, inplace=True)
def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype):
# GH#44697
arr = pd.array([1, 2, 3], dtype=any_numeric_ea_dtype)
obj = frame_or_series(arr)
mask = np.array([True, True, False], ndmin=obj.ndim).T
msg = "|".join(
[
r"datetime64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype",
r"timedelta64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype",
r"int\(\) argument must be a string, a bytes-like object or a number, "
"not 'NaTType'",
"object cannot be converted to a FloatingDtype",
"'values' contains non-numeric NA",
]
)
for null in tm.NP_NAT_OBJECTS + [pd.NaT]:
# NaT is an NA value that we should *not* cast to pd.NA dtype
with pytest.raises(TypeError, match=msg):
obj.where(mask, null)
with pytest.raises(TypeError, match=msg):
obj.mask(mask, null)
@given(data=OPTIONAL_ONE_OF_ALL)
def test_where_inplace_casting(data):
# GH 22051
df = DataFrame({"a": data})
df_copy = df.where(pd.notnull(df), None).copy()
df.where(pd.notnull(df), None, inplace=True)
tm.assert_equal(df, df_copy)

View File

@@ -0,0 +1,392 @@
import re
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
IndexSlice,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
import pandas.core.common as com
from pandas.tseries.offsets import BDay
@pytest.fixture
def four_level_index_dataframe():
arr = np.array(
[
[-0.5109, -2.3358, -0.4645, 0.05076, 0.364],
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
[-0.6662, -0.5243, -0.358, 0.89145, 2.5838],
]
)
index = MultiIndex(
levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]],
codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]],
names=["one", "two", "three", "four"],
)
return DataFrame(arr, index=index, columns=list("ABCDE"))
class TestXS:
def test_xs(self, float_frame, datetime_frame):
idx = float_frame.index[5]
xs = float_frame.xs(idx)
for item, value in xs.items():
if np.isnan(value):
assert np.isnan(float_frame[item][idx])
else:
assert value == float_frame[item][idx]
# mixed-type xs
test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
frame = DataFrame(test_data)
xs = frame.xs("1")
assert xs.dtype == np.object_
assert xs["A"] == 1
assert xs["B"] == "1"
with pytest.raises(
KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')")
):
datetime_frame.xs(datetime_frame.index[0] - BDay())
# xs get column
series = float_frame.xs("A", axis=1)
expected = float_frame["A"]
tm.assert_series_equal(series, expected)
# view is returned if possible
series = float_frame.xs("A", axis=1)
series[:] = 5
assert (expected == 5).all()
def test_xs_corner(self):
# pathological mixed-type reordering case
df = DataFrame(index=[0])
df["A"] = 1.0
df["B"] = "foo"
df["C"] = 2.0
df["D"] = "bar"
df["E"] = 3.0
xs = df.xs(0)
exp = Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0)
tm.assert_series_equal(xs, exp)
# no columns but Index(dtype=object)
df = DataFrame(index=["a", "b", "c"])
result = df.xs("a")
expected = Series([], name="a", index=Index([]), dtype=np.float64)
tm.assert_series_equal(result, expected)
def test_xs_duplicates(self):
df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"])
cross = df.xs("c")
exp = df.iloc[2]
tm.assert_series_equal(cross, exp)
def test_xs_keep_level(self):
df = DataFrame(
{
"day": {0: "sat", 1: "sun"},
"flavour": {0: "strawberry", 1: "strawberry"},
"sales": {0: 10, 1: 12},
"year": {0: 2008, 1: 2008},
}
).set_index(["year", "flavour", "day"])
result = df.xs("sat", level="day", drop_level=False)
expected = df[:1]
tm.assert_frame_equal(result, expected)
with tm.assert_produces_warning(FutureWarning):
result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False)
tm.assert_frame_equal(result, expected)
def test_xs_view(self, using_array_manager):
# in 0.14 this will return a view if possible a copy otherwise, but
# this is numpy dependent
dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5))
if using_array_manager:
# INFO(ArrayManager) with ArrayManager getting a row as a view is
# not possible
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
dm.xs(2)[:] = 20
assert not (dm.xs(2) == 20).any()
else:
dm.xs(2)[:] = 20
assert (dm.xs(2) == 20).all()
class TestXSWithMultiIndex:
def test_xs_doc_example(self):
# TODO: more descriptive name
# based on example in advanced.rst
arrays = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
tuples = list(zip(*arrays))
index = MultiIndex.from_tuples(tuples, names=["first", "second"])
df = DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index)
result = df.xs(("one", "bar"), level=("second", "first"), axis=1)
expected = df.iloc[:, [0]]
tm.assert_frame_equal(result, expected)
def test_xs_integer_key(self):
# see GH#2107
dates = range(20111201, 20111205)
ids = list("abcde")
index = MultiIndex.from_product([dates, ids], names=["date", "secid"])
df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"])
result = df.xs(20111201, level="date")
expected = df.loc[20111201, :]
tm.assert_frame_equal(result, expected)
def test_xs_level(self, multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
result = df.xs("two", level="second")
expected = df[df.index.get_level_values(1) == "two"]
expected.index = Index(["foo", "bar", "baz", "qux"], name="first")
tm.assert_frame_equal(result, expected)
def test_xs_level_eq_2(self):
arr = np.random.randn(3, 5)
index = MultiIndex(
levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]],
codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]],
)
df = DataFrame(arr, index=index)
expected = DataFrame(arr[1:2], index=[["a"], ["b"]])
result = df.xs("c", level=2)
tm.assert_frame_equal(result, expected)
def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data):
# this is a copy in 0.14
df = multiindex_dataframe_random_data
result = df.xs("two", level="second")
# setting this will give a SettingWithCopyError
# as we are trying to write a view
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
result[:] = 10
def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe):
# this is a copy in 0.14
df = four_level_index_dataframe
result = df.xs(("a", 4), level=["one", "four"])
# setting this will give a SettingWithCopyError
# as we are trying to write a view
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
result[:] = 10
@pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])])
def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data):
# see GH#13719
frame = multiindex_dataframe_random_data
df = concat([frame] * 2)
assert df.index.is_unique is False
expected = concat([frame.xs("one", level="second")] * 2)
if isinstance(key, list):
with tm.assert_produces_warning(FutureWarning):
result = df.xs(key, level=level)
else:
result = df.xs(key, level=level)
tm.assert_frame_equal(result, expected)
def test_xs_missing_values_in_index(self):
# see GH#6574
# missing values in returned index should be preserved
acc = [
("a", "abcde", 1),
("b", "bbcde", 2),
("y", "yzcde", 25),
("z", "xbcde", 24),
("z", None, 26),
("z", "zbcde", 25),
("z", "ybcde", 26),
]
df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"])
expected = DataFrame(
{"cnt": [24, 26, 25, 26]},
index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"),
)
result = df.xs("z", level="a1")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"key, level, exp_arr, exp_index",
[
("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")),
("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")),
],
)
def test_xs_named_levels_axis_eq_1(self, key, level, exp_arr, exp_index):
# see GH#2903
arr = np.random.randn(4, 4)
index = MultiIndex(
levels=[["a", "b"], ["bar", "foo", "hello", "world"]],
codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
names=["lvl0", "lvl1"],
)
df = DataFrame(arr, columns=index)
result = df.xs(key, level=level, axis=1)
expected = DataFrame(exp_arr(arr), columns=exp_index)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"indexer",
[
lambda df: df.xs(("a", 4), level=["one", "four"]),
lambda df: df.xs("a").xs(4, level="four"),
],
)
def test_xs_level_multiple(self, indexer, four_level_index_dataframe):
df = four_level_index_dataframe
expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]]
expected_index = MultiIndex(
levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"]
)
expected = DataFrame(
expected_values, index=expected_index, columns=list("ABCDE")
)
result = indexer(df)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")]
)
def test_xs_level0(self, indexer, four_level_index_dataframe):
df = four_level_index_dataframe
expected_values = [
[-0.5109, -2.3358, -0.4645, 0.05076, 0.364],
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
]
expected_index = MultiIndex(
levels=[["b", "q"], [10.0032, 20.0], [4, 5]],
codes=[[0, 1], [0, 1], [1, 0]],
names=["two", "three", "four"],
)
expected = DataFrame(
expected_values, index=expected_index, columns=list("ABCDE")
)
result = indexer(df)
tm.assert_frame_equal(result, expected)
def test_xs_values(self, multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
result = df.xs(("bar", "two")).values
expected = df.values[4]
tm.assert_almost_equal(result, expected)
def test_xs_loc_equality(self, multiindex_dataframe_random_data):
df = multiindex_dataframe_random_data
result = df.xs(("bar", "two"))
expected = df.loc[("bar", "two")]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("klass", [DataFrame, Series])
def test_xs_IndexSlice_argument_not_implemented(self, klass):
# GH#35301
index = MultiIndex(
levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
)
obj = DataFrame(np.random.randn(6, 4), index=index)
if klass is Series:
obj = obj[0]
expected = obj.iloc[-2:].droplevel(0)
result = obj.xs(IndexSlice[("foo", "qux", 0), :])
tm.assert_equal(result, expected)
result = obj.loc[IndexSlice[("foo", "qux", 0), :]]
tm.assert_equal(result, expected)
@pytest.mark.parametrize("klass", [DataFrame, Series])
def test_xs_levels_raises(self, klass):
obj = DataFrame({"A": [1, 2, 3]})
if klass is Series:
obj = obj["A"]
msg = "Index must be a MultiIndex"
with pytest.raises(TypeError, match=msg):
obj.xs(0, level="as")
def test_xs_multiindex_droplevel_false(self):
# GH#19056
mi = MultiIndex.from_tuples(
[("a", "x"), ("a", "y"), ("b", "x")], names=["level1", "level2"]
)
df = DataFrame([[1, 2, 3]], columns=mi)
result = df.xs("a", axis=1, drop_level=False)
expected = DataFrame(
[[1, 2]],
columns=MultiIndex.from_tuples(
[("a", "x"), ("a", "y")], names=["level1", "level2"]
),
)
tm.assert_frame_equal(result, expected)
def test_xs_droplevel_false(self):
# GH#19056
df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)
def test_xs_droplevel_false_view(self, using_array_manager):
# GH#37832
df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
# check that result still views the same data as df
assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)
# modifying original df also modifies result when having a single block
df.iloc[0, 0] = 2
expected = DataFrame({"a": [2]})
tm.assert_frame_equal(result, expected)
# with mixed dataframe, modifying the parent doesn't modify result
# TODO the "split" path behaves differently here as with single block
df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"]))
result = df.xs("a", axis=1, drop_level=False)
df.iloc[0, 0] = 2
if using_array_manager:
# Here the behavior is consistent
expected = DataFrame({"a": [2]})
else:
# FIXME: iloc does not update the array inplace using
# "split" path
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(result, expected)
def test_xs_list_indexer_droplevel_false(self):
# GH#41760
mi = MultiIndex.from_tuples([("x", "m", "a"), ("x", "n", "b"), ("y", "o", "c")])
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi)
with tm.assert_produces_warning(FutureWarning):
with pytest.raises(KeyError, match="y"):
df.xs(["x", "y"], drop_level=False, axis=1)

View File

@@ -0,0 +1,7 @@
"""
Test files dedicated to individual (stand-alone) DataFrame methods
Ideally these files/tests should correspond 1-to-1 with tests.series.methods
These may also present opportunities for sharing/de-duplicating test code.
"""

Some files were not shown because too many files have changed in this diff Show More