first commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
test cython .agg behavior
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import is_float_dtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
bdate_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Compare a groupby reduction with the same reduction applied per group.

    ``op_name`` is looked up as a method both on the groupby object and on
    the ``"C"`` column of every individual group.
    """
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.randn(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def op(obj):
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for key, chunk in by_a:
        per_group[key] = op(chunk["C"])
    exp = DataFrame({"C": per_group})
    exp.index.name = "A"
    tm.assert_frame_equal(op(by_a), exp)

    # multiple columns
    by_a_b = frame.groupby(["A", "B"])
    nested = {}
    for (outer, inner), chunk in by_a_b:
        nested.setdefault(outer, {})[inner] = op(chunk["C"])
    exp = DataFrame(nested).T.stack(dropna=False)
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(by_a_b)["C"]
    # only sum and prod are compared for the two-key grouping
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_cython_agg_boolean():
    """Grouped ``mean`` of a boolean column must equal ``agg(np.mean)``."""
    keys = np.random.randint(0, 5, 50)
    flags = np.random.randint(0, 2, 50).astype("bool")
    frame = DataFrame({"a": keys, "b": flags})

    via_method = frame.groupby("a")["b"].mean()
    via_agg = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(via_method, via_agg)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg():
    """Check ``mean`` when the only value column holds strings.

    On the selected Series the call is expected to raise
    (``NotImplementedError`` with ``numeric_only=True``, ``TypeError``
    without it).  On the one-column DataFrame it is expected to emit a
    ``FutureWarning`` and return a frame without columns.
    """
    labels = ["foo", "bar"] * 25
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": labels})

    with pytest.raises(NotImplementedError, match="does not implement"):
        frame.groupby("a")["b"].mean(numeric_only=True)

    with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
        frame.groupby("a")["b"].mean()

    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": labels})

    with tm.assert_produces_warning(FutureWarning):
        result = frame[["b"]].groupby(frame["a"]).mean()
    group_keys = frame["a"].sort_values().drop_duplicates()
    expected = DataFrame([], index=group_keys)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a grouped datetime column must raise."""
    columns = {
        "a": np.random.randint(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="T"),
    }
    frame = DataFrame(columns)

    with pytest.raises(NotImplementedError, match="does not implement"):
        frame.groupby("b").dates.mean(numeric_only=True)
|
||||
|
||||
|
||||
def test_cython_agg_frame_columns():
    """Take the column-wise grouped mean several times in a row (#2113)."""
    # #2113
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    # The same aggregation is issued four times; the test only requires
    # that none of the calls raises.
    for _ in range(4):
        df.groupby(level=0, axis="columns").mean()
|
||||
|
||||
|
||||
def test_cython_agg_return_dict():
    """An aggregation function may return a dict for each group (GH 16741)."""
    # GH 16741
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    def counts_as_dict(values):
        return values.value_counts().to_dict()

    ts = df.groupby("A")["B"].agg(counts_as_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)
|
||||
|
||||
|
||||
def test_cython_fail_agg():
    """Grouped ``sum`` of a string Series must equal ``agg(np.sum)``."""
    business_days = bdate_range("1/1/2000", periods=50)
    letters = ["A", "B", "C", "D", "E"] * 10
    ts = Series(letters, index=business_days)

    by_month = ts.groupby(lambda stamp: stamp.month)
    via_method = by_month.sum()
    via_agg = by_month.agg(np.sum)
    tm.assert_series_equal(via_method, via_agg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` must give the same frame as ``agg(targop)``."""
    values = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = values.groupby(labels)._cython_agg_general(
        op, alt=None, numeric_only=True
    )
    expected = values.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Compare ``_cython_agg_general`` with ``agg`` when grouping by ``pd.cut`` bins."""
    df = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    binned = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    result = binned._cython_agg_general(op, alt=None, numeric_only=True)

    # a fresh groupby object is built for the reference computation
    binned = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    expected = binned.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_empty_buckets_nanops(observed):
    """Check "add" and "prod" over binned groups against hardcoded results.

    With ``observed`` truthy, the rows of the expected frame that hold the
    fill value of the empty bins (0 for add, 1 for prod) are removed.
    """
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = DataFrame([11, 12, 13], columns=["a"])
    grps = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected values per bin, value found in empty bins)
    cases = [
        ("add", [0, 0, 36, 0], 0),  # add / sum
        ("prod", [1, 1, 1716, 1], 1),  # prod
    ]
    for how, bin_values, empty_value in cases:
        binned = df.groupby(pd.cut(df["a"], grps), observed=observed)
        result = binned._cython_agg_general(how, alt=None, numeric_only=True)

        expected = DataFrame(
            {"a": bin_values},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            expected = expected[expected.a != empty_value]

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregating one-row groups must keep both the value and ``NaT``."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": values}, index=Index([0, 1], name="a"))

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame with a read-only array must match aggregating its copy."""
    # https://github.com/pandas-dev/pandas/issues/36014
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa"] * 5,
        }
    )
    # mark the first internal array as non-writeable
    df._mgr.arrays[0].flags.writeable = False

    def aggregate(frame):
        return frame.groupby(["species"]).agg({"sepal_length": agg})

    result = aggregate(df)
    expected = aggregate(df.copy())

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on an ``Int64`` column must match the ``float64`` equivalent."""
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    nullable = pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64")
    df = DataFrame({"A": ["A", "B"] * 5, "B": nullable})

    def reduce_b(frame):
        return getattr(frame.groupby("A")["B"], op_name)()

    result = reduce_b(df)
    expected = reduce_b(df.assign(B=df["B"].astype("float64")))

    if op_name != "count":
        # the result is not yet consistently using Int64/Float64 dtype,
        # so for now just checking the values by casting to float
        result = result.astype("float64")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of groupby reductions on masked extension arrays.

    Parameters
    ----------
    data : extension array with four elements
    op_name : name of the groupby reduction to call
    action : one of "always_int", "large_int", "always_float", "preserve";
        selects how the expected dtype is derived from ``data.dtype``
    with_na : when true, the last element is replaced by ``pd.NA``
    """
    if with_na:
        # The parametrized arrays are created once and handed to every test
        # case, so mutate a copy; otherwise the NA leaks into the cases that
        # are meant to run without missing values.
        data = data.copy()
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype
    else:
        # fail with a clear message instead of an unbound-name error below
        raise ValueError(f"unknown action: {action!r}")

    # DataFrameGroupBy, via method and via aggregate
    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    # SeriesGroupBy, via method and via aggregate
    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
|
||||
@@ -0,0 +1,216 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import NumbaUtilError
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NamedAgg,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_correct_function_signature():
    """A function whose parameters are not as required is rejected by the numba engine."""

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # same expectation for the DataFrame and the Series groupby
    targets = (data.groupby("key"), data.groupby("key")["data"])
    for grouped in targets:
        with pytest.raises(NumbaUtilError, match="The first 2"):
            grouped.agg(incorrect_function, engine="numba")
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_check_nopython_kwargs():
    """Passing an extra keyword argument with ``engine="numba"`` must raise."""

    def incorrect_function(x, **kwargs):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # same expectation for the DataFrame and the Series groupby
    targets = (data.groupby("key"), data.groupby("key")["data"])
    for grouped in targets:
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            grouped.agg(incorrect_function, engine="numba", a=1)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython):
    """The numba engine must give the same result as the cython engine."""

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        # narrow to the value column to exercise the Series groupby
        grouped = grouped[1]

    result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Switching between two functions must keep results and cache entries correct."""
    # Test that the functions are cached correctly if we switch functions
    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    def run_numba(func):
        return grouped.agg(func, engine="numba", engine_kwargs=engine_kwargs)

    result = run_numba(func_1)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)
    # func_1 should be in the cache now
    assert (func_1, "groupby_agg") in NUMBA_FUNC_CACHE

    # Add func_2 to the cache
    result = run_numba(func_2)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
    tm.assert_equal(result, expected)
    assert (func_2, "groupby_agg") in NUMBA_FUNC_CACHE

    # Retest func_1 which should use the cache
    result = run_numba(func_1)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_use_global_config():
    """``engine=None`` under ``compute.use_numba`` must match ``engine="numba"``."""

    def func_1(values, index):
        return np.mean(values) - 3.4

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    by_first = frame.groupby(0)

    expected = by_first.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = by_first.agg(func_1, engine=None)
    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.parametrize(
    "agg_func",
    [
        ["min", "max"],
        "min",
        {"B": ["min", "max"], "C": "sum"},
        NamedAgg(column="B", aggfunc="min"),
    ],
)
def test_multifunc_notimplimented(agg_func):
    """Non-callable aggregation specs must raise with ``engine="numba"``."""
    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    by_first = frame.groupby(0)

    # DataFrame groupby
    with pytest.raises(NotImplementedError, match="Numba engine can"):
        by_first.agg(agg_func, engine="numba")

    # Series groupby
    with pytest.raises(NotImplementedError, match="Numba engine can"):
        by_first[1].agg(agg_func, engine="numba")
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_args_not_cached():
    """A second call with a different positional argument must use that argument."""
    # GH 41647
    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # every group holds two ones, so summing the last n gives n
    for n, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.agg(sum_last, n, engine="numba")
        expected = Series([total] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_index_data_correctly_passed():
    """The ``index`` argument must carry the index labels of each group."""
    # GH 43133
    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")

    # group A holds labels -1 and -2, group B holds label -3
    group_index = Index(["A", "B"], name="group")
    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a two-level-indexed frame by a single level works with numba."""

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    by_a = df.groupby("A")
    result = by_a.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)

    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels must raise with ``engine="numba"``."""

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    with pytest.raises(NotImplementedError, match="More than 1 grouping labels"):
        df.groupby(["A", "B"]).agg(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
|
||||
@@ -0,0 +1,673 @@
|
||||
"""
|
||||
test all other .agg behavior
|
||||
"""
|
||||
|
||||
import datetime as dt
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.base import SpecificationError
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
def test_agg_api():
    """``agg(func)`` and ``agg([func])`` must agree on a mixed-dtype frame."""
    # GH 6337
    # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame

    df = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # both call forms are expected to warn about the string column
    warning_msg = r"\['key2'\] did not aggregate successfully"

    with tm.assert_produces_warning(FutureWarning, match=warning_msg):
        expected = grouped.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]

    with tm.assert_produces_warning(FutureWarning, match=warning_msg):
        result = grouped.agg(peak_to_peak)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_datetimes_mixed():
    """Grouping by date strings or by ``date`` objects yields equally many groups."""

    def to_frame(rows):
        return DataFrame(
            {
                "key": [row[0] for row in rows],
                "date": [row[1] for row in rows],
                "value": [row[2] for row in rows],
            }
        )

    raw_rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
    df1 = to_frame(raw_rows)

    # same rows, with the date strings parsed into datetime.date objects
    parsed_rows = []
    for key, stamp, value in raw_rows:
        parsed = dt.datetime.strptime(stamp, "%Y-%m-%d").date() if stamp else None
        parsed_rows.append([key, parsed, value])
    df2 = to_frame(parsed_rows)

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    # the weights for df2 are computed from df1, as in the first frame
    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)
|
||||
|
||||
|
||||
def test_agg_period_index():
    """Grouping on a ``PeriodIndex`` keeps the index type and can be iterated."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.randn(3, 2), index=monthly)
    summed = df.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
    by_month = df.groupby(df.index.month)
    # materialising the groups must not raise
    list(by_month)
|
||||
|
||||
|
||||
def test_agg_dict_parameter_cast_result_dtypes():
    """``first``/``last``/count give the same result through every agg spelling."""
    # GH 12821

    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` and `last` functions; each is paired with the row
    # labels whose values make up the expected frame
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    exp = Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = Series([0, 1, 1, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.count(), exp)
|
||||
|
||||
|
||||
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column must equal ``count()``."""
    # similar to GH12821
    # xref #11444
    first_of_month = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    letters = list("aaabbbbbbccd")
    df = DataFrame({"X": letters, "Y": first_of_month})

    by_letter = df.groupby("X")["Y"]
    result = by_letter.agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_float64_no_int64():
    """Grouped ``mean`` of integer columns is compared against float expectations."""
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
    group_means = [1, 2.5, 4, 5]
    group_keys = [1, 2, 4, 5]

    # one selected column
    expected = DataFrame({"a": group_means}, index=group_keys)
    expected.index.name = "b"
    tm.assert_frame_equal(df.groupby("b")[["a"]].mean(), expected)

    # two selected columns
    expected = DataFrame({"a": group_means, "c": group_means}, index=group_keys)
    expected.index.name = "b"
    tm.assert_frame_equal(df.groupby("b")[["a", "c"]].mean(), expected)
|
||||
|
||||
|
||||
def test_aggregate_api_consistency():
    """Check that list, dict and dict-of-list agg specs give consistent results.

    The expected frames are assembled from the individual ``mean``/``sum``
    reductions of columns "C" and "D".  A dict naming columns that are not
    in the selection must raise ``KeyError``.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of names on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of callables on the whole frame
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of callables on a reordered column selection
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> single name
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> list of names
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # this comparison was previously missing, leaving the case unchecked
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
|
||||
|
||||
|
||||
def test_agg_dict_renaming_deprecation():
    """Nested renaming dicts and unknown column keys must raise in ``agg``."""
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
    nested_msg = r"nested renamer is not supported"

    # dict of dicts on the frame groupby
    with pytest.raises(SpecificationError, match=nested_msg):
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )

    # key that is not one of the selected columns
    missing_msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=missing_msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # dict on a series groupby
    with pytest.raises(SpecificationError, match=nested_msg):
        df.groupby("A").B.agg({"foo": "count"})
|
||||
|
||||
|
||||
def test_agg_compat():
    """Passing a dict to ``agg`` on a selected column must raise."""
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    rejected_specs = (
        {"C": ["sum", "std"]},
        {"C": "sum", "D": "std"},
    )
    for spec in rejected_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
|
||||
|
||||
|
||||
def test_agg_nested_dicts():
    """Nested dict specs must raise on both frame and series groupbys."""
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    # frame groupby, dict of dicts
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    series_specs = (
        {"result1": np.sum, "result2": np.mean},
        {"D": np.sum, "result2": np.mean},
    )
    for spec in series_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
|
||||
|
||||
|
||||
def test_agg_item_by_item_raise_typeerror():
    """A ``TypeError`` raised inside the aggregation function must propagate."""
    df = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        # print the group that was passed in, then fail
        separator = "----------------------------------------"
        pprint_thing(separator)
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
            df.groupby(0).agg(raiseException)
|
||||
|
||||
|
||||
def test_series_agg_multikey():
    """``agg(np.sum)`` with two key functions must equal ``sum()``."""
    ts = tm.makeTimeSeries()

    def year_of(stamp):
        return stamp.year

    def month_of(stamp):
        return stamp.month

    by_year_month = ts.groupby([year_of, month_of])

    via_agg = by_year_month.agg(np.sum)
    via_method = by_year_month.sum()
    tm.assert_series_equal(via_agg, via_method)
|
||||
|
||||
|
||||
def test_series_agg_multi_pure_python():
    """A pure-Python UDF that inspects ``x.values.base`` aggregates like a constant."""
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        # The group passed to the UDF must still expose a non-empty base array.
        assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_consistency():
    """``agg(func)`` and ``agg([func])`` produce the same values for a UDF."""
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    expected = g.agg([P1])
    # Drop the function-name level so the columns match the scalar-func result.
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_callables():
    """Equivalent summing callables all give the same ``agg`` result."""
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    equiv_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    ]

    expected = df.groupby("foo").agg(sum)
    for ecall in equiv_callables:
        result = df.groupby("foo").agg(ecall)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_over_numpy_arrays():
    """Summing an object column of ndarrays adds the arrays element-wise per group."""
    # GH 3788
    df = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = df.groupby("category")

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    """UDFs over tz-aware (or period) values may return a different dtype."""
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    #  with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
    expected.index.name = "a"
    if as_period:
        # Subtracting periods yields offsets rather than Timedeltas.
        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
        expected.index.name = "a"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_timezone_round_trip():
    """Tz-aware timestamps survive groupby reductions and selections unchanged."""
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})

    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
|
||||
|
||||
|
||||
def test_sum_uint64_overflow():
    """Group sums of values above the int64 maximum do not overflow."""
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
    )

    expected.index.name = 0
    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating a frame with ``tuple``/``list`` collects each group's values."""
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Aggregating a column with ``tuple``/``list`` collects each group's values."""
    # Issue #18079
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby("A")["C"].aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_category_nansum(observed):
|
||||
categories = ["a", "b", "c"]
|
||||
df = DataFrame(
|
||||
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
|
||||
)
|
||||
result = df.groupby("A", observed=observed).B.agg(np.nansum)
|
||||
expected = Series(
|
||||
[3, 3, 0],
|
||||
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
|
||||
name="B",
|
||||
)
|
||||
if observed:
|
||||
expected = expected[expected != 0]
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_list_like_func():
    """A UDF returning a list is stored as one list per group."""
    # GH 18473
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})
    expected = DataFrame(
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_lambda_with_timezone():
    """A lambda aggregation over a tz-aware column keeps the UTC timestamp."""
    # GH 23683
    df = DataFrame(
        {
            "tag": [1, 1],
            "date": [
                pd.Timestamp("2018-01-01", tz="UTC"),
                pd.Timestamp("2018-01-02", tz="UTC"),
            ],
        }
    )
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    """Aggregation succeeds when the UDF raises ``err_cls`` only for empty input."""
    # make sure we suppress anything other than TypeError or AssertionError
    #  in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # weird function that raises something other than TypeError or
        #  AssertionError in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
|
||||
@@ -0,0 +1,191 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
from pandas.core.groupby.base import (
|
||||
reduction_kernels,
|
||||
transformation_kernels,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def sort(request):
    """Boolean fixture parametrized over True and False."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def as_index(request):
    """Boolean fixture parametrized over True and False."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture
def mframe(multiindex_dataframe_random_data):
    """Return the ``multiindex_dataframe_random_data`` fixture value unchanged."""
    return multiindex_dataframe_random_data
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Eight-row frame with string key columns A/B and random float columns C/D."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def ts():
    """Return the Series built by ``tm.makeTimeSeries()``."""
    return tm.makeTimeSeries()
|
||||
|
||||
|
||||
@pytest.fixture
def tsd():
    """Return the data built by ``tm.getTimeSeriesData()``."""
    return tm.getTimeSeriesData()
|
||||
|
||||
|
||||
@pytest.fixture
def tsframe(tsd):
    """DataFrame constructed from the ``tsd`` fixture."""
    return DataFrame(tsd)
|
||||
|
||||
|
||||
@pytest.fixture
def df_mixed_floats():
    """Like the ``df`` fixture, but column D holds float32 values."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.array(np.random.randn(8), dtype="float32"),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def three_group():
    """Eleven-row frame with three string key columns and three random columns."""
    return DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture()
def slice_test_df():
    """Eight-row frame with Group/Value columns, indexed by the ``Index`` column."""
    data = [
        [0, "a", "a0_at_0"],
        [1, "b", "b0_at_1"],
        [2, "a", "a1_at_2"],
        [3, "b", "b1_at_3"],
        [4, "c", "c0_at_4"],
        [5, "a", "a2_at_5"],
        [6, "a", "a3_at_6"],
        [7, "a", "a4_at_7"],
    ]
    df = DataFrame(data, columns=["Index", "Group", "Value"])
    return df.set_index("Index")
|
||||
|
||||
|
||||
@pytest.fixture()
def slice_test_grouped(slice_test_df):
    """``slice_test_df`` grouped by its ``Group`` column with ``as_index=False``."""
    return slice_test_df.groupby("Group", as_index=False)
|
||||
|
||||
|
||||
@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
    """
    yields the string names of all groupby reduction functions, one at a time.
    """
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
    """yields the string names of all groupby transformation functions."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
    """yields both aggregation and transformation functions."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def parallel(request):
    """parallel keyword argument for numba.jit"""
    return request.param
|
||||
|
||||
|
||||
# Can parameterize nogil & nopython over True | False, but limiting per
|
||||
# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472
|
||||
|
||||
|
||||
@pytest.fixture(params=[False])
def nogil(request):
    """nogil keyword argument for numba.jit"""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True])
def nopython(request):
    """nopython keyword argument for numba.jit"""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        ("mean", {}),
        ("var", {"ddof": 1}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("sum", {}),
    ]
)
def numba_supported_reductions(request):
    """reductions supported with engine='numba'"""
    return request.param
|
||||
@@ -0,0 +1,450 @@
|
||||
"""
|
||||
test methods relating to generic function evaluation
|
||||
the so-called white/black lists
|
||||
"""
|
||||
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.groupby.base import (
|
||||
groupby_other_methods,
|
||||
reduction_kernels,
|
||||
transformation_kernels,
|
||||
)
|
||||
|
||||
# Aggregation method names exercised by the regression test in this module.
AGG_FUNCTIONS = [
    "sum",
    "prod",
    "min",
    "max",
    "median",
    "mean",
    "skew",
    "mad",
    "std",
    "var",
    "sem",
]
# Subset of AGG_FUNCTIONS that is called with an explicit ``skipna`` argument.
AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"]

# Attribute names compared against the DataFrame groupby's ``_apply_allowlist``.
df_allowlist = [
    "quantile",
    "fillna",
    "mad",
    "take",
    "idxmax",
    "idxmin",
    "tshift",
    "skew",
    "plot",
    "hist",
    "dtypes",
    "corrwith",
    "corr",
    "cov",
    "diff",
]
|
||||
|
||||
|
||||
@pytest.fixture(params=df_allowlist)
def df_allowlist_fixture(request):
    """Yield each name in ``df_allowlist``, one at a time."""
    return request.param
|
||||
|
||||
|
||||
# Attribute names compared against the Series groupby's ``_apply_allowlist``.
s_allowlist = [
    "quantile",
    "fillna",
    "mad",
    "take",
    "idxmax",
    "idxmin",
    "tshift",
    "skew",
    "plot",
    "hist",
    "dtype",
    "corr",
    "cov",
    "diff",
    "unique",
    "nlargest",
    "nsmallest",
    "is_monotonic_increasing",
    "is_monotonic_decreasing",
]
|
||||
|
||||
|
||||
@pytest.fixture(params=s_allowlist)
def s_allowlist_fixture(request):
    """Yield each name in ``s_allowlist``, one at a time."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Eight-row frame with string key columns A/B and random float columns C/D."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df_letters():
    """Ten-row frame with random ``floats`` and random lowercase ``letters``."""
    letters = np.array(list(ascii_lowercase))
    N = 10
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame(
        {
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )
    return df
|
||||
|
||||
|
||||
@pytest.mark.parametrize("allowlist", [df_allowlist, s_allowlist])
def test_groupby_allowlist(df_letters, allowlist):
    """The groupby object's ``_apply_allowlist`` matches the expected name list."""
    df = df_letters
    if allowlist == df_allowlist:
        # dataframe
        obj = df_letters
    else:
        obj = df_letters["floats"]

    gb = obj.groupby(df.letters)

    assert set(allowlist) == set(gb._apply_allowlist)
|
||||
|
||||
|
||||
def check_allowlist(obj, df, m):
    """
    Check that attribute ``m`` on the groupby class carries the expected names.

    ``obj`` is grouped by ``df.letters`` and ``m`` is looked up on the resulting
    groupby type. Attributes without ``__name__`` / ``__qualname__`` are skipped.
    """
    # check the obj for a particular allowlist m

    gb = obj.groupby(df.letters)

    f = getattr(type(gb), m)

    # name
    try:
        n = f.__name__
    except AttributeError:
        return
    assert n == m

    # qualname
    try:
        n = f.__qualname__
    except AttributeError:
        return
    assert n.endswith(m)
|
||||
|
||||
|
||||
def test_groupby_series_allowlist(df_letters, s_allowlist_fixture):
    """Run ``check_allowlist`` for one Series allowlist name."""
    m = s_allowlist_fixture
    df = df_letters
    check_allowlist(df.letters, df, m)
|
||||
|
||||
|
||||
def test_groupby_frame_allowlist(df_letters, df_allowlist_fixture):
    """Run ``check_allowlist`` for one DataFrame allowlist name."""
    m = df_allowlist_fixture
    df = df_letters
    check_allowlist(df, df, m)
|
||||
|
||||
|
||||
@pytest.fixture
def raw_frame(multiindex_dataframe_random_data):
    """``multiindex_dataframe_random_data`` with NaN written into rows 1 and 7."""
    df = multiindex_dataframe_random_data
    df.iloc[1, [1, 2]] = np.nan
    df.iloc[7, [0, 1]] = np.nan
    return df
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort):
    """Grouping by level gives the same result as the frame's level reduction."""
    # GH6944
    # GH 17537
    # explicitly test the allowlist methods

    if axis == 0:
        frame = raw_frame
    else:
        frame = raw_frame.T

    # Only some reductions are called with skipna; the rest take no extra kwargs.
    kwargs = {"skipna": skipna} if op in AGG_FUNCTIONS_WITH_SKIPNA else {}

    grouped = frame.groupby(level=level, axis=axis, sort=sort)
    result = getattr(grouped, op)(**kwargs)
    with tm.assert_produces_warning(FutureWarning):
        expected = getattr(frame, op)(level=level, axis=axis, **kwargs)
    if sort:
        expected = expected.sort_index(axis=axis, level=level)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_blocklist(df_letters):
|
||||
df = df_letters
|
||||
s = df_letters.floats
|
||||
|
||||
blocklist = [
|
||||
"eval",
|
||||
"query",
|
||||
"abs",
|
||||
"where",
|
||||
"mask",
|
||||
"align",
|
||||
"groupby",
|
||||
"clip",
|
||||
"astype",
|
||||
"at",
|
||||
"combine",
|
||||
"consolidate",
|
||||
"convert_objects",
|
||||
]
|
||||
to_methods = [method for method in dir(df) if method.startswith("to_")]
|
||||
|
||||
blocklist.extend(to_methods)
|
||||
|
||||
for bl in blocklist:
|
||||
for obj in (df, s):
|
||||
gb = obj.groupby(df.letters)
|
||||
|
||||
# e.g., to_csv
|
||||
defined_but_not_allowed = (
|
||||
f"(?:^Cannot.+{repr(bl)}.+'{type(gb).__name__}'.+try "
|
||||
f"using the 'apply' method$)"
|
||||
)
|
||||
|
||||
# e.g., query, eval
|
||||
not_defined = (
|
||||
f"(?:^'{type(gb).__name__}' object has no attribute {repr(bl)}$)"
|
||||
)
|
||||
|
||||
msg = f"{defined_but_not_allowed}|{not_defined}"
|
||||
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
getattr(gb, bl)
|
||||
|
||||
|
||||
def test_tab_completion(mframe):
    """``dir()`` on a groupby exposes exactly the expected public names."""
    grp = mframe.groupby(level="second")
    results = {v for v in dir(grp) if not v.startswith("_")}
    expected = {
        "A",
        "B",
        "C",
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        "filter",
        "first",
        "get_group",
        "groups",
        "hist",
        "indices",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "ngroups",
        "nth",
        "ohlc",
        "plot",
        "prod",
        "size",
        "std",
        "sum",
        "transform",
        "var",
        "sem",
        "count",
        "nunique",
        "head",
        "describe",
        "cummax",
        "quantile",
        "rank",
        "cumprod",
        "tail",
        "resample",
        "cummin",
        "fillna",
        "cumsum",
        "cumcount",
        "ngroup",
        "all",
        "shift",
        "skew",
        "take",
        "tshift",
        "pct_change",
        "any",
        "mad",
        "corr",
        "corrwith",
        "cov",
        "dtypes",
        "ndim",
        "diff",
        "idxmax",
        "idxmin",
        "ffill",
        "bfill",
        "pad",
        "backfill",
        "rolling",
        "expanding",
        "pipe",
        "sample",
        "ewm",
        "value_counts",
    }
    assert results == expected
|
||||
|
||||
|
||||
def test_groupby_function_rename(mframe):
|
||||
grp = mframe.groupby(level="second")
|
||||
for name in ["sum", "prod", "min", "max", "first", "last"]:
|
||||
f = getattr(grp, name)
|
||||
assert f.__name__ == name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"method",
|
||||
[
|
||||
"count",
|
||||
"corr",
|
||||
"cummax",
|
||||
"cummin",
|
||||
"cumprod",
|
||||
"describe",
|
||||
"rank",
|
||||
"quantile",
|
||||
"diff",
|
||||
"shift",
|
||||
"all",
|
||||
"any",
|
||||
"idxmin",
|
||||
"idxmax",
|
||||
"ffill",
|
||||
"bfill",
|
||||
"pct_change",
|
||||
],
|
||||
)
|
||||
def test_groupby_selection_with_methods(df, method):
|
||||
# some methods which require DatetimeIndex
|
||||
rng = date_range("2014", periods=len(df))
|
||||
df.index = rng
|
||||
|
||||
g = df.groupby(["A"])[["C"]]
|
||||
g_exp = df[["C"]].groupby(df["A"])
|
||||
# TODO check groupby with > 1 col ?
|
||||
|
||||
res = getattr(g, method)()
|
||||
exp = getattr(g_exp, method)()
|
||||
|
||||
# should always be frames!
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_groupby_selection_tshift_raises(df):
    """``tshift`` on a column selection raises when the index has no freq set."""
    rng = date_range("2014", periods=len(df))
    df.index = rng

    g = df.groupby(["A"])[["C"]]

    # check that the index cache is cleared
    with pytest.raises(ValueError, match="Freq was not set in the index"):
        # GH#35937
        g.tshift()
|
||||
|
||||
|
||||
def test_groupby_selection_other_methods(df):
    """Column selection after grouping matches for methods needing extra arguments."""
    # some methods which require DatetimeIndex
    rng = date_range("2014", periods=len(df))
    df.columns.name = "foo"
    df.index = rng

    g = df.groupby(["A"])[["C"]]
    g_exp = df[["C"]].groupby(df["A"])

    # methods which aren't just .foo()
    tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
    tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
    tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))

    tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
    tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())

    tm.assert_frame_equal(
        g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
    )
|
||||
|
||||
|
||||
def test_all_methods_categorized(mframe):
|
||||
grp = mframe.groupby(mframe.iloc[:, 0])
|
||||
names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns)
|
||||
new_names = set(names)
|
||||
new_names -= reduction_kernels
|
||||
new_names -= transformation_kernels
|
||||
new_names -= groupby_other_methods
|
||||
|
||||
assert not (reduction_kernels & transformation_kernels)
|
||||
assert not (reduction_kernels & groupby_other_methods)
|
||||
assert not (transformation_kernels & groupby_other_methods)
|
||||
|
||||
# new public method?
|
||||
if new_names:
|
||||
msg = f"""
|
||||
There are uncatgeorized methods defined on the Grouper class:
|
||||
{new_names}.
|
||||
|
||||
Was a new method recently added?
|
||||
|
||||
Every public method On Grouper must appear in exactly one the
|
||||
following three lists defined in pandas.core.groupby.base:
|
||||
- `reduction_kernels`
|
||||
- `transformation_kernels`
|
||||
- `groupby_other_methods`
|
||||
see the comments in pandas/core/groupby/base.py for guidance on
|
||||
how to fix this test.
|
||||
"""
|
||||
raise AssertionError(msg)
|
||||
|
||||
# removed a public method?
|
||||
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
|
||||
print(names)
|
||||
print(all_categorized)
|
||||
if not (names == all_categorized):
|
||||
msg = f"""
|
||||
Some methods which are supposed to be on the Grouper class
|
||||
are missing:
|
||||
{all_categorized - names}.
|
||||
|
||||
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
|
||||
If you removed a method, you should update them
|
||||
"""
|
||||
raise AssertionError(msg)
|
||||
@@ -0,0 +1,190 @@
|
||||
import builtins
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["foo", "bar", "baz"],
        ["foo", "", ""],
        ["", "", ""],
        [1, 2, 3],
        [1, 0, 0],
        [0, 0, 0],
        [1.0, 2.0, 3.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0],
        [True, True, True],
        [True, False, False],
        [False, False, False],
        [np.nan, np.nan, np.nan],
    ],
)
def test_groupby_bool_aggs(agg_func, skipna, vals):
    """Groupby ``any``/``all`` agree with the Python builtins on each group."""
    df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})

    # Figure out expectation using Python builtin
    exp = getattr(builtins, agg_func)(vals)

    # edge case for missing data with skipna and 'any'
    if skipna and all(isna(vals)) and agg_func == "any":
        exp = False

    exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key"))
    result = getattr(df.groupby("key"), agg_func)(skipna=skipna)
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
def test_any():
    """``any`` per group is False for an all-NaN group and True for strings."""
    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
        columns=["A", "B", "C"],
    )
    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"
    result = df.groupby("A").any()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_bool_aggs_dup_column_labels(bool_agg_func):
    """``any``/``all`` keep both columns when the column labels are duplicated."""
    # 21668
    df = DataFrame([[True, True]], columns=["a", "a"])
    grp_by = df.groupby([0])
    result = getattr(grp_by, bool_agg_func)()

    expected = df
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
    "data",
    [
        [False, False, False],
        [True, True, True],
        [pd.NA, pd.NA, pd.NA],
        [False, pd.NA, False],
        [True, pd.NA, True],
        [True, pd.NA, False],
    ],
)
def test_masked_kleene_logic(bool_agg_func, skipna, data):
    """Grouped ``any``/``all`` on a boolean Series match the whole-Series result."""
    # GH#37506
    ser = Series(data, dtype="boolean")

    # The result should match aggregating on the whole series. Correctness
    # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
    expected_data = getattr(ser, bool_agg_func)(skipna=skipna)
    expected = Series(expected_data, dtype="boolean")

    result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype1,dtype2,exp_col1,exp_col2",
    [
        (
            "float",
            "Float64",
            np.array([True], dtype=bool),
            pd.array([pd.NA], dtype="boolean"),
        ),
        (
            "Int64",
            "float",
            pd.array([pd.NA], dtype="boolean"),
            np.array([True], dtype=bool),
        ),
        (
            "Int64",
            "Int64",
            pd.array([pd.NA], dtype="boolean"),
            pd.array([pd.NA], dtype="boolean"),
        ),
        (
            "Float64",
            "boolean",
            pd.array([pd.NA], dtype="boolean"),
            pd.array([pd.NA], dtype="boolean"),
        ),
    ],
)
def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
    """``all(skipna=False)`` is computed per column for mixed masked/NumPy dtypes."""
    # GH#37506
    data = [1.0, np.nan]
    df = DataFrame(
        {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
    )
    result = df.groupby([1, 1]).agg("all", skipna=False)

    expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
||||
@pytest.mark.parametrize("skipna", [True, False])
|
||||
def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
|
||||
# GH#40585
|
||||
obj = frame_or_series([pd.NA, 1], dtype=dtype)
|
||||
expected_res = True
|
||||
if not skipna and bool_agg_func == "all":
|
||||
expected_res = pd.NA
|
||||
expected = frame_or_series([expected_res], index=[1], dtype="boolean")
|
||||
|
||||
result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bool_agg_func,data,expected_res",
    [
        ("any", [pd.NA, np.nan], False),
        ("any", [pd.NA, 1, np.nan], True),
        ("all", [pd.NA, pd.NaT], True),
        ("all", [pd.NA, False, pd.NaT], False),
    ],
)
def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series):
    """Check ``any``/``all`` on object-dtype data containing missing values."""
    # GH#37501
    obj = frame_or_series(data, dtype=object)

    # Put every row into the single group labelled 1.
    keys = [1] * len(data)
    result = obj.groupby(keys).agg(bool_agg_func)

    expected = frame_or_series([expected_res], index=[1], dtype="bool")
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_object_NA_raises_with_skipna_false(bool_agg_func):
    """An object Series holding only ``pd.NA`` must raise with ``skipna=False``."""
    # GH#37501
    ser = Series([pd.NA], dtype=object)
    grouped = ser.groupby([1])
    with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
        grouped.agg(bool_agg_func, skipna=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_empty(frame_or_series, bool_agg_func):
    """``any``/``all`` on an empty object-dtype input must give an empty bool result."""
    # GH 45231
    if frame_or_series is DataFrame:
        kwargs = {"columns": ["a"]}
    else:
        kwargs = {"name": "a"}
    obj = frame_or_series(**kwargs, dtype=object)

    aggregate = getattr(obj.groupby(obj.index), bool_agg_func)
    result = aggregate()

    expected = frame_or_series(**kwargs, dtype=bool)
    tm.assert_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,130 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_mutate_groups():
    """Applying a function that mutates its group must match the copying variant."""
    # GH3380
    cat1 = ["a"] * 8 + ["b"] * 6
    # Pairs of labels: c, d, e, f, c, d, e (14 entries in total).
    cat2 = [label for label in "cdefcde" for _ in range(2)]
    cat3 = [f"g{x}" for x in range(1, 15)]
    df = pd.DataFrame(
        {
            "cat1": cat1,
            "cat2": cat2,
            "cat3": cat3,
            "val": np.random.randint(100, size=14),
        }
    )

    def f_no_copy(x):
        # Adds the "rank" column directly onto the group it was handed.
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    def f_copy(x):
        # Same computation, but performed on a copy of the group.
        return f_no_copy(x.copy())

    grpby_copy = df.groupby("cat1").apply(f_copy)
    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
    tm.assert_series_equal(grpby_copy, grpby_no_copy)
|
||||
|
||||
|
||||
def test_no_mutate_but_looks_like():
    """Selecting ``key`` via ``x[:].key`` and via ``x.key`` must agree under apply."""
    # GH 8467
    # the first variant shows the mutation indicator,
    # the second does not, but both should yield the same results
    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    def via_slice(x):
        return x[:].key

    def direct(x):
        return x.key

    result1 = df.groupby("key", group_keys=True).apply(via_slice)
    result2 = df.groupby("key", group_keys=True).apply(direct)
    tm.assert_series_equal(result1, result2)
|
||||
|
||||
|
||||
def test_apply_function_with_indexing():
    """Apply a function that zeroes the last ``col2`` entry of each group."""
    # GH: 33058
    df = pd.DataFrame({"col1": list("AAABBB"), "col2": [1, 2, 3, 4, 5, 6]})

    def fn(x):
        last_label = x.index[-1]
        x.col2[last_label] = 0
        return x.col2

    result = df.groupby(["col1"], as_index=False).apply(fn)

    # First level is the group position (0 or 1), second the original row label.
    index_tuples = [(row // 3, row) for row in range(6)]
    expected = pd.Series(
        [1, 2, 0, 4, 5, 0],
        index=pd.MultiIndex.from_tuples(index_tuples),
        name="col2",
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_mutate_columns_multiindex():
    """Add a per-group ``sum`` column while grouping MultiIndex columns by level 1."""
    # GH 12652
    data_keys = [
        ("C", "julian"),
        ("B", "geoffrey"),
        ("A", "julian"),
        ("B", "julian"),
        ("A", "geoffrey"),
        ("C", "geoffrey"),
    ]
    column_order = [
        ("A", "julian"),
        ("A", "geoffrey"),
        ("B", "julian"),
        ("B", "geoffrey"),
        ("C", "julian"),
        ("C", "geoffrey"),
    ]
    df = pd.DataFrame(
        {key: [1, 2, 3] for key in data_keys},
        columns=pd.MultiIndex.from_tuples(column_order),
    )

    def add_column(grouped):
        # The second level of the first column label names the group.
        _, name = grouped.columns[0]
        grouped["sum", name] = grouped.sum(axis=1)
        return grouped

    result = df.groupby(level=1, axis=1).apply(add_column)

    # Each person contributes columns A, B, C (value v) plus their sum (3 * v).
    expected_rows = [[v, v, v, 3 * v] * 2 for v in (1, 2, 3)]
    expected_columns = [
        (person, label, person)
        for person in ("geoffrey", "julian")
        for label in ("A", "B", "C", "sum")
    ]
    expected = pd.DataFrame(
        expected_rows,
        columns=pd.MultiIndex.from_tuples(expected_columns),
    )
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,69 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import lib
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def assert_block_lengths(x):
    """Assert that the first block's ``mgr_locs`` is as long as ``x``; return 0."""
    first_block = x._mgr.blocks[0]
    assert len(x) == len(first_block.mgr_locs)
    return 0
|
||||
|
||||
|
||||
def cumsum_max(x):
    """Run ``cumsum`` then ``max`` on ``x``, discard the value, and return 0."""
    running_total = x.cumsum()
    running_total.max()
    return 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "func",
    [
        cumsum_max,
        pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test),
    ],
)
def test_mgr_locs_updated(func):
    """Aggregate with ``func`` over two grouping keys and expect zeros in ``C``."""
    # https://github.com/pandas-dev/pandas/issues/31802
    # Some operations may require creating new blocks, which requires
    # valid mgr_locs
    df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
    result = df.groupby(["A", "B"]).agg(func)

    expected_index = pd.MultiIndex.from_product(
        [["a"], ["a", "b"]], names=["A", "B"]
    )
    expected = pd.DataFrame({"C": [0, 0]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "binner,closed,expected",
    [
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "left",
            np.array([2, 5, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "right",
            np.array([3, 6, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "left",
            np.array([2, 5], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "right",
            np.array([3, 6], dtype=np.int64),
        ),
    ],
)
def test_generate_bins(binner, closed, expected):
    """``lib.generate_bins_dt64`` over the values 1..6 must match ``expected``."""
    values = np.arange(1, 7, dtype=np.int64)
    bins = lib.generate_bins_dt64(values, binner, closed=closed)
    tm.assert_numpy_array_equal(bins, expected)
|
||||
|
||||
|
||||
class TestMoments:
    """Empty test class; it defines no tests."""
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,378 @@
|
||||
from itertools import product
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Period,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCounting:
    """Tests for ``cumcount``, ``ngroup`` and ``count`` on groupby objects."""

    def test_cumcount(self):
        """cumcount numbers rows within each group (frame and column groupby)."""
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_empty(self):
        """cumcount on empty inputs returns an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.cumcount())
        tm.assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        """cumcount keeps a fully duplicated index on its result."""
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        """cumcount keeps a MultiIndex (with repeated entries) on its result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        """cumcount works when grouping by a list of keys instead of a column."""
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        """ngroup labels each row with the number of its group."""
        df = DataFrame({"A": list("aaaba")})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        """With all-distinct keys, ngroup is 0..4 as int64."""
        df = DataFrame({"A": list("abcde")})
        g = df.groupby("A")
        sg = g.A

        expected = Series(range(5), dtype="int64")

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        """With a single group, ngroup is all zeros."""
        df = DataFrame({"A": [0] * 5})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        """ngroup on empty inputs returns an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.ngroup())
        tm.assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        """ngroup agrees between a frame and a Series grouped by the same keys."""
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))

        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        """ngroup keeps a fully duplicated index on its result."""
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        """ngroup keeps a MultiIndex (with repeated entries) on its result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({"A": list("aaaba")}, index=mi)
        g = df.groupby("A")
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        """ngroup works when grouping by a list of keys instead of a column."""
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        """``ascending=False`` reverses the group numbering."""
        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        g = df.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
        tm.assert_series_equal(ascending, g.ngroup(ascending=True))
        tm.assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        """Check ngroup and cumcount together on one hand-computed case."""
        # verify one manually-worked out case works
        df = DataFrame(
            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
            columns=["A", "X"],
        )
        g = df.groupby(["A", "X"])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(g_ngroup, expected_ngroup)
        tm.assert_series_equal(g_cumcount, expected_cumcount)

    def test_ngroup_cumcount_pair(self):
        """Compare ngroup/cumcount with pure-Python results for all 3**4 inputs."""
        # brute force comparison for all small series
        for p in product(range(3), repeat=4):
            df = DataFrame({"a": p})
            g = df.groupby(["a"])

            # Reference values: position among sorted distinct keys, and the
            # number of earlier occurrences of the same key.
            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            tm.assert_series_equal(g.ngroup(), Series(ngroupd))
            tm.assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self):
        """ngroup/cumcount follow groupby iteration order for sort=False and True."""
        np.random.seed(0)
        df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
        for sort_flag in (False, True):
            g = df.groupby(["a"], sort=sort_flag)
            df["group_id"] = -1
            df["group_index"] = -1

            # Rebuild the expected ids by walking the groups in iteration order.
            for i, (_, group) in enumerate(g):
                df.loc[group.index, "group_id"] = i
                for j, ind in enumerate(group.index):
                    df.loc[ind, "group_index"] = j

            tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
            tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        """count works on a column of timestamps, timedeltas or periods."""
        # test for #13393, where DataframeGroupBy.count() fails
        # when counting a datetimelike column.

        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = df.groupby("x").count()
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"
        tm.assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        """Grouping on an all-NaN first key gives an empty int64 count."""
        # GH21956
        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = df.groupby(["A", "B"]).C.count()
        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
        expected = Series([], index=mi, dtype=np.int64, name="C")
        tm.assert_series_equal(result, expected, check_index_type=False)

    def test_count_groupby_column_with_nan_in_groupby_column(self):
        """The row whose grouping key is NaN is absent from the count result."""
        # https://github.com/pandas-dev/pandas/issues/32841
        # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
        df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
        res = df.groupby(["B"]).count()
        expected = DataFrame(
            index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
        )
        tm.assert_frame_equal(expected, res)

    def test_groupby_count_dateparseerror(self):
        """count agrees whichever index level holds the datetimes."""
        dr = date_range(start="1/1/2012", freq="5min", periods=10)

        # BAD Example, datetimes first
        ser = Series(np.arange(10), index=[dr, np.arange(10)])
        grouped = ser.groupby(lambda x: x[1] % 2 == 0)
        result = grouped.count()

        ser = Series(np.arange(10), index=[np.arange(10), dr])
        grouped = ser.groupby(lambda x: x[0] % 2 == 0)
        expected = grouped.count()

        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_timedelta_cython_count():
    """Counting a timedelta64[ns] column per group gives 2 for each of a and b."""
    deltas = np.arange(4).astype("timedelta64[ns]")
    df = DataFrame({"g": list("ab" * 2), "delt": deltas})

    group_index = Index(["a", "b"], name="g")
    expected = Series([2, 2], index=group_index, name="delt")

    result = df.groupby("g").delt.count()
    tm.assert_series_equal(expected, result)
|
||||
|
||||
|
||||
def test_count():
    """Compare groupby ``count`` with ``apply(DataFrame.count)`` on random data.

    Builds a 2**15-row frame of strings, ints, floats, datetimes, timedeltas
    and a categorical, overwrites random rows of most columns with NaN, then
    checks both counting paths agree for single and multiple grouping keys.
    """
    n = 1 << 15
    # "min" is the minute alias; the older "T" spelling is deprecated.
    dr = date_range("2015-08-30", periods=n // 10, freq="min")

    df = DataFrame(
        {
            "1st": np.random.choice(list(ascii_lowercase), n),
            "2nd": np.random.randint(0, 5, n),
            "3rd": np.random.randn(n).round(3),
            "4th": np.random.randint(-10, 10, n),
            "5th": np.random.choice(dr, n),
            "6th": np.random.randn(n).round(3),
            "7th": np.random.randn(n).round(3),
            "8th": np.random.choice(dr, n) - np.random.choice(dr, 1),
            "9th": np.random.choice(list(ascii_lowercase), n),
        }
    )

    # "1st", "2nd" and "4th" are left without NaN; "1st"/"2nd" are the keys below.
    for col in df.columns.drop(["1st", "2nd", "4th"]):
        df.loc[np.random.choice(n, n // 10), col] = np.nan

    df["9th"] = df["9th"].astype("category")

    for key in ["1st", "2nd", ["1st", "2nd"]]:
        left = df.groupby(key).count()
        right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
        tm.assert_frame_equal(left, right)
|
||||
|
||||
|
||||
def test_count_non_nulls():
    """``count`` counts only non-null entries, with and without ``as_index``."""
    # GH#5610
    # count counts non-nulls
    rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]]
    df = DataFrame(rows, columns=["A", "B", "C"])

    count_as = df.groupby("A").count()
    count_not_as = df.groupby("A", as_index=False).count()

    expected = DataFrame(
        [[1, 2], [0, 0]], columns=["B", "C"], index=Index([1, 3], name="A")
    )
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    # Selecting a single column gives the matching column of the frame result.
    count_B = df.groupby("A")["B"].count()
    tm.assert_series_equal(count_B, expected["B"])
|
||||
|
||||
|
||||
def test_count_object():
    """Count an object column per group, first without and then with NaN."""
    keys = [2] * 3 + [3] * 3
    expected_index = Index([2, 3], name="c")

    # No missing values: each group counts all three of its rows.
    df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": keys})
    result = df.groupby("c").a.count()
    expected = Series([3, 3], index=expected_index, name="a")
    tm.assert_series_equal(result, expected)

    # Two NaN in the first group reduce its count to one.
    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": keys})
    result = df.groupby("c").a.count()
    expected = Series([1, 3], index=expected_index, name="a")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_cross_type():
    """``count`` gives the same result after casting ``a``/``b`` to other dtypes."""
    # GH8169
    left_half = np.random.randint(0, 5, (100, 2))
    right_half = np.random.randint(0, 2, (100, 2))
    vals = np.hstack((left_half, right_half))

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    for t in ["float32", "object"]:
        for col in ("a", "b"):
            df[col] = df[col].astype(t)
        result = df.groupby(["c", "d"]).count()
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_lower_int_prec_count():
    """Count int8, uint32 and int16 columns per group; each count is 2."""
    columns = {
        "a": np.array([0, 1, 2, 100], np.int8),
        "b": np.array([1, 2, 3, 6], np.uint32),
        "c": np.array([4, 5, 6, 8], np.int16),
        "grp": list("ab" * 2),
    }
    df = DataFrame(columns)
    result = df.groupby("grp").count()

    expected_index = Index(list("ab"), name="grp")
    expected = DataFrame(
        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=expected_index
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_uses_size_on_exception():
    """``count`` still returns per-group counts for objects whose ``__eq__`` raises."""

    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython"):
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    raisers = [RaisingObject() for _ in range(4)]
    df = DataFrame({"a": raisers, "grp": list("ab" * 2)})
    result = df.groupby("grp").count()

    expected_index = Index(list("ab"), name="grp")
    expected = DataFrame({"a": [2, 2]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,614 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_filter_series():
    """Filter a Series grouped by parity on the group mean, with and without dropna."""
    s = Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = Series([20, 22, 24], index=[2, 4, 5])
    grouped = s.groupby(s.apply(lambda x: x % 2))

    def mean_below_10(x):
        return x.mean() < 10

    def mean_above_10(x):
        return x.mean() > 10

    cases = [(mean_below_10, expected_odd), (mean_above_10, expected_even)]

    for keep, expected in cases:
        tm.assert_series_equal(grouped.filter(keep), expected)

    # Test dropna=False.
    for keep, expected in cases:
        tm.assert_series_equal(
            grouped.filter(keep, dropna=False), expected.reindex(s.index)
        )
|
||||
|
||||
|
||||
def test_filter_single_column_df():
    """Filter a one-column frame grouped by parity on the group mean."""
    df = DataFrame([1, 3, 20, 5, 22, 24, 7])
    expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
    grouped = df.groupby(df[0].apply(lambda x: x % 2))

    def mean_below_10(x):
        return x.mean() < 10

    def mean_above_10(x):
        return x.mean() > 10

    cases = [(mean_below_10, expected_odd), (mean_above_10, expected_even)]

    for keep, expected in cases:
        tm.assert_frame_equal(grouped.filter(keep), expected)

    # Test dropna=False.
    for keep, expected in cases:
        tm.assert_frame_equal(
            grouped.filter(keep, dropna=False), expected.reindex(df.index)
        )
|
||||
|
||||
|
||||
def test_filter_multi_column_df():
    """Filter a two-column frame using a condition that reads both columns."""
    df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    grouped = df.groupby(df["A"].apply(lambda x: x % 2))

    def a_exceeds_b_by_10(x):
        return x["A"].sum() - x["B"].sum() > 10

    result = grouped.filter(a_exceeds_b_by_10)
    expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_filter_mixed_df():
    """Filter a frame with an int column and a string column on the int sum."""
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    parity = df["A"].apply(lambda x: x % 2)
    grouped = df.groupby(parity)

    result = grouped.filter(lambda x: x["A"].sum() > 10)
    expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_filter_out_all_groups():
    """A condition no group satisfies gives an empty selection of the input."""
    # Series case
    s = Series([1, 3, 20, 5, 22, 24, 7])
    series_groups = s.groupby(s.apply(lambda x: x % 2))
    series_result = series_groups.filter(lambda x: x.mean() > 1000)
    tm.assert_series_equal(series_result, s[[]])

    # DataFrame case
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = df.groupby(df["A"].apply(lambda x: x % 2))
    frame_result = frame_groups.filter(lambda x: x["A"].sum() > 1000)
    tm.assert_frame_equal(frame_result, df.loc[[]])
|
||||
|
||||
|
||||
def test_filter_out_no_groups():
    """A condition every group satisfies gives back the original object."""
    # Series case
    s = Series([1, 3, 20, 5, 22, 24, 7])
    series_groups = s.groupby(s.apply(lambda x: x % 2))
    tm.assert_series_equal(series_groups.filter(lambda x: x.mean() > 0), s)

    # DataFrame case
    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = df.groupby(df["A"].apply(lambda x: x % 2))
    tm.assert_frame_equal(frame_groups.filter(lambda x: x["A"].mean() > 0), df)
|
||||
|
||||
|
||||
def test_filter_out_all_groups_in_df():
    """Filter out every group of a frame, for ``dropna=False`` and ``dropna=True``."""
    # GH12768

    def b_sum_above_5(x):
        return x["b"].sum() > 5

    # dropna=False: an all-NaN frame is expected.
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a").filter(b_sum_above_5, dropna=False)
    expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(expected, res)

    # dropna=True: an empty int64 frame is expected.
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a").filter(b_sum_above_5, dropna=True)
    expected = DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(expected, res)
|
||||
|
||||
|
||||
def test_filter_condition_raises():
    """A filter function that raises ``ValueError`` surfaces as ``TypeError``."""

    def raise_if_sum_is_zero(x):
        if x.sum() == 0:
            raise ValueError
        return x.sum() > 0

    s = Series([-1, 0, 1, 2])
    grouped = s.groupby(s.apply(lambda x: x % 2))

    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        grouped.filter(raise_if_sum_is_zero)
|
||||
|
||||
|
||||
def test_filter_with_axis_in_groupby():
    """Filter column groups (``axis=1``) on the first value of each group."""
    # issue 11041
    columns = pd.MultiIndex.from_product([range(10), [0, 1]])
    values = np.arange(100).reshape(-1, 20)
    data = DataFrame(values, columns=columns, dtype="int64")

    by_level0 = data.groupby(level=0, axis=1)
    result = by_level0.filter(lambda x: x.iloc[0, 0] > 10)

    expected = data.iloc[:, 12:20]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_filter_bad_shapes():
    """Filter functions that do not return a scalar bool must raise ``TypeError``."""
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    frame_msg = "filter function returned a DataFrame, but expected a scalar bool"
    outer_msg = "can't multiply sequence by non-int of type 'str'"
    series_msg = "the filter must return a boolean result"

    # Each case pairs a filter function with the message expected from the
    # DataFrame groupby; the Series groupby message is the same for all three.
    cases = [
        (lambda x: x, frame_msg),
        (lambda x: x == 1, frame_msg),
        (lambda x: np.outer(x, x), outer_msg),
    ]
    for f, df_msg in cases:
        with pytest.raises(TypeError, match=df_msg):
            g_df.filter(f)
        with pytest.raises(TypeError, match=series_msg):
            g_s.filter(f)
|
||||
|
||||
|
||||
def test_filter_nan_is_false():
    """A filter function returning NaN drops every group."""
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby(df["B"])
    g_s = s.groupby(s)

    def always_nan(x):
        return np.nan

    tm.assert_frame_equal(g_df.filter(always_nan), df.loc[[]])
    tm.assert_series_equal(g_s.filter(always_nan), s[[]])
|
||||
|
||||
|
||||
def test_filter_against_workaround():
    """``filter`` must match boolean-masking with a transformed condition."""
    np.random.seed(0)

    def mean_above_10(x):
        return x.mean() > 10

    # The factories are called one after another, so the random draws happen
    # in this order: the int Series first, then the float Series.
    series_factories = [
        lambda: Series(np.random.randint(0, 100, 1000)),  # Series of ints
        lambda: 100 * Series(np.random.random(1000)),  # Series of floats
    ]
    for make_series in series_factories:
        s = make_series()
        grouped = s.groupby(s.apply(lambda x: np.round(x, -1)))

        old_way = s[grouped.transform(mean_above_10).astype("bool")]
        new_way = grouped.filter(mean_above_10)
        tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase

    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame(
        {
            "ints": Series(np.random.randint(0, 100, N)),
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )

    # Group by ints; filter on floats.
    grouped = df.groupby("ints")
    mask = grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")
    old_way = df[mask]
    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouped = df.groupby(df.floats.apply(lambda x: np.round(x, -1)))
    mask = grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")
    old_way = df[mask]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby("letters")
    mask = grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")
    old_way = df[mask]
    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
|
||||
|
||||
|
||||
def test_filter_using_len():
    """Filter groups by their length, for both a frame and a Series."""
    # BUG GH4447
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})

    def longer_than_2(x):
        return len(x) > 2

    def longer_than_4(x):
        return len(x) > 4

    grouped = df.groupby("B")

    # Only the four-row "b" group is longer than two rows.
    kept_rows = np.arange(2, 6)
    actual = grouped.filter(longer_than_2)
    expected = DataFrame(
        {"A": kept_rows, "B": list("bbbb"), "C": np.arange(2, 6)},
        index=np.arange(2, 6),
    )
    tm.assert_frame_equal(actual, expected)

    # No group is longer than four rows.
    actual = grouped.filter(longer_than_4)
    tm.assert_frame_equal(actual, df.loc[[]])

    # Series have always worked properly, but we'll test anyway.
    s = df["B"]
    grouped = s.groupby(s)

    actual = grouped.filter(longer_than_2)
    expected = Series(4 * ["b"], index=np.arange(2, 6), name="B")
    tm.assert_series_equal(actual, expected)

    actual = grouped.filter(longer_than_4)
    tm.assert_series_equal(actual, s[[]])
|
||||
|
||||
|
||||
def test_filter_maintains_ordering():
    """``filter`` keeps the original row order for three index layouts."""
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
    )
    # Positions of the rows whose tag occurs more than once.
    kept = [1, 2, 4, 7]

    def check_current_index():
        # Frame grouped by its own "tag" column.
        s = df["pid"]
        grouped = df.groupby("tag")
        actual = grouped.filter(lambda x: len(x) > 1)
        tm.assert_frame_equal(actual, df.iloc[kept])

        # "pid" Series grouped by the frame's "tag" column.
        grouped = s.groupby(df["tag"])
        actual = grouped.filter(lambda x: len(x) > 1)
        tm.assert_series_equal(actual, s.iloc[kept])

    # Simple case: index is sequential. #4621
    check_current_index()

    # Now index is sequentially decreasing.
    df.index = np.arange(len(df) - 1, -1, -1)
    check_current_index()

    # Index is shuffled.
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    check_current_index()
|
||||
|
||||
|
||||
def test_filter_multiple_timestamp():
    """Filter and transform when grouping by a string and a Timestamp column."""
    # GH 10114
    df = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )
    grouped = df.groupby(["B", "C"])
    column_a = grouped["A"]

    def keep_all(x):
        return True

    # Column groupby: filter that keeps everything, then transform(len).
    tm.assert_series_equal(df["A"], column_a.filter(keep_all))
    tm.assert_series_equal(
        column_a.transform(len), Series([2, 3, 2, 3, 3], name="A")
    )

    # Frame groupby: filter that keeps everything, then two transforms.
    tm.assert_frame_equal(df, grouped.filter(keep_all))
    tm.assert_frame_equal(
        grouped.transform("sum"), DataFrame({"A": [2, 8, 2, 8, 8]})
    )
    tm.assert_frame_equal(
        grouped.transform(len), DataFrame({"A": [2, 3, 2, 3, 3]})
    )
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_int_index():
    """Filter/transform must align by position on a non-unique integer index.

    Regression test for GH4620.
    """
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Positions of the rows whose tag (45 or 62) occurs more than once.
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not depend on an implicit
    # int64 -> float64 upcast, which newer pandas deprecates.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_multiple_non_unique_int_index():
    """Filter/transform must align by position when several index labels repeat.

    Regression test for GH4620. Here the labels 0 and 1 each occur multiple
    times in the index.
    """
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Positions of the rows whose tag (45 or 62) occurs more than once.
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not depend on an implicit
    # int64 -> float64 upcast, which newer pandas deprecates.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_float_index():
    """Filter/transform must align by position on a non-unique float index.

    Regression test for GH4620.
    """
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Positions of the rows whose tag (45 or 62) occurs more than once.
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not depend on an implicit
    # int64 -> float64 upcast, which newer pandas deprecates.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_timestamp_index():
    """Filter/transform must align by position on a non-unique Timestamp index.

    Regression test for GH4620.
    """
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Positions of the rows whose tag (45 or 62) occurs more than once.
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not depend on an implicit
    # int64 -> float64 upcast, which newer pandas deprecates.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_string_index():
    """Filter/transform must align by position on a non-unique string index.

    Regression test for GH4620.
    """
    index = list("bbbcbbab")
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Positions of the rows whose tag (45 or 62) occurs more than once.
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not depend on an implicit
    # int64 -> float64 upcast, which newer pandas deprecates.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
|
||||
|
||||
|
||||
def test_filter_has_access_to_grouped_cols():
    """The filter callback can read the column the frame is grouped by."""
    frame = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    by_a = frame.groupby("A")
    # Per the original note, the callback previously could not see column "A"
    # (the issue number was not recorded). Only the A == 1 group sums to 2
    # over "A", so exactly its two rows survive.
    kept = by_a.filter(lambda grp: grp["A"].sum() == 2)
    tm.assert_frame_equal(kept, frame.iloc[[0, 1]])
|
||||
|
||||
|
||||
def test_filter_enforces_scalarness():
    """A filter callback returning a non-scalar result must raise TypeError."""
    rows = [
        ["best", "a", "x"],
        ["worst", "b", "y"],
        ["best", "c", "x"],
        ["best", "d", "y"],
        ["worst", "d", "y"],
        ["worst", "d", "y"],
        ["best", "d", "z"],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    with pytest.raises(TypeError, match="filter function returned a.*"):
        # The comparison is elementwise, so the callback hands back one
        # boolean per row instead of a single verdict for the group.
        frame.groupby("c").filter(lambda grp: grp["a"] == "best")
|
||||
|
||||
|
||||
def test_filter_non_bool_raises():
    """A filter callback returning a non-boolean scalar must raise TypeError."""
    rows = [
        ["best", "a", 1],
        ["worst", "b", 1],
        ["best", "c", 1],
        ["best", "d", 1],
        ["worst", "d", 1],
        ["worst", "d", 1],
        ["best", "d", 1],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    with pytest.raises(TypeError, match="filter function returned a.*"):
        # The group mean of "c" is a number, not a bool.
        frame.groupby("a").filter(lambda grp: grp.c.mean())
|
||||
|
||||
|
||||
def test_filter_dropna_with_empty_groups():
    """Filtering out every group honours ``dropna`` (GH 10780)."""
    values = Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
    by_level = values.groupby(level=0)

    # np.random.rand draws from [0, 1), so no group mean can exceed 1 and
    # every group is rejected by the predicate.
    kept_as_nan = by_level.filter(lambda grp: grp.mean() > 1, dropna=False)
    all_nan = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(kept_as_nan, all_nan)

    dropped = by_level.filter(lambda grp: grp.mean() > 1, dropna=True)
    empty = Series(index=pd.Index([], dtype=int), dtype=np.float64)
    tm.assert_series_equal(dropped, empty)
|
||||
|
||||
|
||||
def test_filter_consistent_result_before_after_agg_func():
    """Running an aggregation must not change a later filter result (GH 17091)."""
    frame = DataFrame({"data": range(6), "key": list("ABCABC")})
    by_key = frame.groupby("key")
    unchanged = DataFrame({"data": range(6), "key": list("ABCABC")})

    before = by_key.filter(lambda grp: True)
    tm.assert_frame_equal(before, unchanged)

    # Result deliberately discarded: only the call itself matters here.
    by_key.sum()

    after = by_key.filter(lambda grp: True)
    tm.assert_frame_equal(after, unchanged)
|
||||
@@ -0,0 +1,444 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def education_df():
    """Return a six-row frame with string columns gender, education and country."""
    columns = {
        "gender": ["male", "male", "female", "male", "female", "male"],
        "education": ["low", "medium", "high", "low", "high", "low"],
        "country": ["US", "FR", "US", "FR", "FR", "FR"],
    }
    return DataFrame(columns)
|
||||
|
||||
|
||||
def test_axis(education_df):
|
||||
gp = education_df.groupby("country", axis=1)
|
||||
with pytest.raises(NotImplementedError, match="axis"):
|
||||
gp.value_counts()
|
||||
|
||||
|
||||
def test_bad_subset(education_df):
|
||||
gp = education_df.groupby("country")
|
||||
with pytest.raises(ValueError, match="subset"):
|
||||
gp.value_counts(subset=["country"])
|
||||
|
||||
|
||||
def test_basic(education_df):
|
||||
# gh43564
|
||||
result = education_df.groupby("country")[["gender", "education"]].value_counts(
|
||||
normalize=True
|
||||
)
|
||||
expected = Series(
|
||||
data=[0.5, 0.25, 0.25, 0.5, 0.5],
|
||||
index=MultiIndex.from_tuples(
|
||||
[
|
||||
("FR", "male", "low"),
|
||||
("FR", "female", "high"),
|
||||
("FR", "male", "medium"),
|
||||
("US", "female", "high"),
|
||||
("US", "male", "low"),
|
||||
],
|
||||
names=["country", "gender", "education"],
|
||||
),
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def _frame_value_counts(df, keys, normalize, sort, ascending):
|
||||
return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df, groupby, normalize, sort, ascending, as_index, frame
):
    """Cross-check DataFrameGroupBy.value_counts against two reference paths.

    Parameters exercised:
    - ``groupby``: group by a column name, an array, or a function
    - ``normalize``: whether to return proportions
    - ``sort`` / ``ascending``: whether and how to sort
    - ``as_index``: whether the group keys become the index
    - ``frame``: compare against ``apply`` with :meth:`~DataFrame.value_counts`
      when true, otherwise against ``SeriesGroupBy.value_counts``
    """
    counted = ["gender", "education"]
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[counted].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )

    if frame:
        # Reference 1: apply DataFrame.value_counts to each group.
        expected = gp.apply(
            _frame_value_counts, ["gender", "education"], normalize, sort, ascending
        )
        if as_index:
            tm.assert_series_equal(result, expected)
            return

        value_label = "proportion" if normalize else "count"
        expected = expected.reset_index().rename({0: value_label}, axis=1)
        # Rebuild the group-key column so it matches what value_counts emits
        # for each kind of grouper.
        if groupby == "column":
            expected = expected.rename({"level_0": "country"}, axis=1)
            expected["country"] = np.where(expected["country"], "US", "FR")
        elif groupby == "function":
            expected["level_0"] = expected["level_0"] == 1
        else:
            expected["level_0"] = np.where(expected["level_0"], "US", "FR")
        tm.assert_frame_equal(result, expected)
        return

    # Reference 2: SeriesGroupBy.value_counts on a synthetic "gender-education"
    # column, which is then split back into its two parts.
    education_df["both"] = education_df["gender"] + "-" + education_df["education"]
    expected = gp["both"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    expected.name = None
    if as_index:
        index_frame = expected.index.to_frame(index=False)
        pieces = index_frame["both"].str.split("-")
        index_frame["gender"] = pieces.str.get(0)
        index_frame["education"] = pieces.str.get(1)
        del index_frame["both"]
        index_frame = index_frame.rename({0: None}, axis=1)
        expected.index = MultiIndex.from_frame(index_frame)
        tm.assert_series_equal(result, expected)
    else:
        pieces = expected["both"].str.split("-")
        expected.insert(1, "gender", pieces.str.get(0))
        expected.insert(2, "education", pieces.str.get(1))
        del expected["both"]
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("normalize", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"sort, ascending, expected_rows, expected_count, expected_group_size",
|
||||
[
|
||||
(False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
|
||||
(True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
|
||||
(True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
|
||||
],
|
||||
)
|
||||
def test_compound(
|
||||
education_df,
|
||||
normalize,
|
||||
sort,
|
||||
ascending,
|
||||
expected_rows,
|
||||
expected_count,
|
||||
expected_group_size,
|
||||
):
|
||||
# Multiple groupby keys and as_index=False
|
||||
gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
|
||||
result = gp["education"].value_counts(
|
||||
normalize=normalize, sort=sort, ascending=ascending
|
||||
)
|
||||
expected = DataFrame()
|
||||
for column in ["country", "gender", "education"]:
|
||||
expected[column] = [education_df[column][row] for row in expected_rows]
|
||||
if normalize:
|
||||
expected["proportion"] = expected_count
|
||||
expected["proportion"] /= expected_group_size
|
||||
else:
|
||||
expected["count"] = expected_count
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.fixture
def animals_df():
    """Return a four-animal frame with a constant ``key`` column."""
    animals = ["falcon", "dog", "cat", "ant"]
    columns = {
        "key": [1, 1, 1, 1],
        "num_legs": [2, 4, 4, 6],
        "num_wings": [2, 0, 0, 0],
    }
    return DataFrame(columns, index=animals)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sort, ascending, normalize, expected_data, expected_index",
|
||||
[
|
||||
(False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
|
||||
(True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
|
||||
(True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
|
||||
(True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
|
||||
],
|
||||
)
|
||||
def test_data_frame_value_counts(
|
||||
animals_df, sort, ascending, normalize, expected_data, expected_index
|
||||
):
|
||||
# 3-way compare with :meth:`~DataFrame.value_counts`
|
||||
# Tests from frame/methods/test_value_counts.py
|
||||
result_frame = animals_df.value_counts(
|
||||
sort=sort, ascending=ascending, normalize=normalize
|
||||
)
|
||||
expected = Series(
|
||||
data=expected_data,
|
||||
index=MultiIndex.from_arrays(
|
||||
expected_index, names=["key", "num_legs", "num_wings"]
|
||||
),
|
||||
)
|
||||
tm.assert_series_equal(result_frame, expected)
|
||||
|
||||
result_frame_groupby = animals_df.groupby("key").value_counts(
|
||||
sort=sort, ascending=ascending, normalize=normalize
|
||||
)
|
||||
|
||||
tm.assert_series_equal(result_frame_groupby, expected)
|
||||
|
||||
|
||||
@pytest.fixture
def nulls_df():
    """Return a nine-row numeric frame with NaN scattered over columns A-D."""
    missing = np.nan
    columns = {
        "A": [1, 1, missing, 4, missing, 6, 6, 6, 6],
        "B": [1, 1, 3, missing, missing, 6, 6, 6, 6],
        "C": [1, 2, 3, 4, 5, 6, missing, 8, missing],
        "D": [1, 2, 3, 4, 5, 6, 7, missing, missing],
    }
    return DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"group_dropna, count_dropna, expected_rows, expected_values",
|
||||
[
|
||||
(
|
||||
False,
|
||||
False,
|
||||
[0, 1, 3, 5, 7, 6, 8, 2, 4],
|
||||
[0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
|
||||
),
|
||||
(False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
|
||||
(True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
|
||||
(True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
|
||||
],
|
||||
)
|
||||
def test_dropna_combinations(
|
||||
nulls_df, group_dropna, count_dropna, expected_rows, expected_values
|
||||
):
|
||||
gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
|
||||
result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
|
||||
columns = DataFrame()
|
||||
for column in nulls_df.columns:
|
||||
columns[column] = [nulls_df[column][row] for row in expected_rows]
|
||||
index = MultiIndex.from_frame(columns)
|
||||
expected = Series(data=expected_values, index=index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    """Return a names frame whose two missing middle names are ``nulls_fixture``."""
    middle_names = ["Smith", nulls_fixture, nulls_fixture, "Louise"]
    columns = {
        "key": [1, 1, 1, 1],
        "first_name": ["John", "Anne", "John", "Beth"],
        "middle_name": middle_names,
    }
    return DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dropna, expected_data, expected_index",
|
||||
[
|
||||
(
|
||||
True,
|
||||
[1, 1],
|
||||
MultiIndex.from_arrays(
|
||||
[(1, 1), ("Beth", "John"), ("Louise", "Smith")],
|
||||
names=["key", "first_name", "middle_name"],
|
||||
),
|
||||
),
|
||||
(
|
||||
False,
|
||||
[1, 1, 1, 1],
|
||||
MultiIndex(
|
||||
levels=[
|
||||
Index([1]),
|
||||
Index(["Anne", "Beth", "John"]),
|
||||
Index(["Louise", "Smith", np.nan]),
|
||||
],
|
||||
codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
|
||||
names=["key", "first_name", "middle_name"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("normalize", [False, True])
|
||||
def test_data_frame_value_counts_dropna(
|
||||
names_with_nulls_df, dropna, normalize, expected_data, expected_index
|
||||
):
|
||||
# GH 41334
|
||||
# 3-way compare with :meth:`~DataFrame.value_counts`
|
||||
# Tests with nulls from frame/methods/test_value_counts.py
|
||||
result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
|
||||
expected = Series(
|
||||
data=expected_data,
|
||||
index=expected_index,
|
||||
)
|
||||
if normalize:
|
||||
expected /= float(len(expected_data))
|
||||
|
||||
tm.assert_series_equal(result_frame, expected)
|
||||
|
||||
result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
|
||||
dropna=dropna, normalize=normalize
|
||||
)
|
||||
|
||||
tm.assert_series_equal(result_frame_groupby, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_index", [False, True])
|
||||
@pytest.mark.parametrize(
|
||||
"observed, expected_index",
|
||||
[
|
||||
(
|
||||
False,
|
||||
[
|
||||
("FR", "male", "low"),
|
||||
("FR", "female", "high"),
|
||||
("FR", "male", "medium"),
|
||||
("FR", "female", "low"),
|
||||
("FR", "female", "medium"),
|
||||
("FR", "male", "high"),
|
||||
("US", "female", "high"),
|
||||
("US", "male", "low"),
|
||||
("US", "female", "low"),
|
||||
("US", "female", "medium"),
|
||||
("US", "male", "high"),
|
||||
("US", "male", "medium"),
|
||||
],
|
||||
),
|
||||
(
|
||||
True,
|
||||
[
|
||||
("FR", "male", "low"),
|
||||
("FR", "female", "high"),
|
||||
("FR", "male", "medium"),
|
||||
("US", "female", "high"),
|
||||
("US", "male", "low"),
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"normalize, expected_data",
|
||||
[
|
||||
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
|
||||
(
|
||||
True,
|
||||
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_categorical(
|
||||
education_df, as_index, observed, expected_index, normalize, expected_data
|
||||
):
|
||||
# Test categorical data whether or not observed
|
||||
gp = education_df.astype("category").groupby(
|
||||
"country", as_index=as_index, observed=observed
|
||||
)
|
||||
result = gp.value_counts(normalize=normalize)
|
||||
|
||||
expected_series = Series(
|
||||
data=expected_data[expected_data > 0.0] if observed else expected_data,
|
||||
index=MultiIndex.from_tuples(
|
||||
expected_index,
|
||||
names=["country", "gender", "education"],
|
||||
),
|
||||
)
|
||||
for i in range(3):
|
||||
expected_series.index = expected_series.index.set_levels(
|
||||
CategoricalIndex(expected_series.index.levels[i]), level=i
|
||||
)
|
||||
|
||||
if as_index:
|
||||
tm.assert_series_equal(result, expected_series)
|
||||
else:
|
||||
expected = expected_series.reset_index(
|
||||
name="proportion" if normalize else "count"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    """value_counts with a list, a column name and a function as groupers."""
    frame = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    groupers = [[4, 5, 4], "A", lambda i: 7 if i == 1 else 8]
    grouped = frame.groupby(groupers, as_index=False)
    result = grouped.value_counts(sort=True, normalize=normalize)

    # The unnamed groupers (the list and the function) are expected to show
    # up as "level_0" and "level_2".
    expected_columns = {
        "level_0": [4, 4, 5],
        "A": [1, 1, 2],
        "level_2": [8, 8, 7],
        "B": [1, 3, 2],
        expected_label: expected_values,
    }
    tm.assert_frame_equal(result, DataFrame(expected_columns))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test, expected_names",
|
||||
[
|
||||
("repeat", ["a", None, "d", "b", "b", "e"]),
|
||||
("level", ["a", None, "d", "b", "c", "level_1"]),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("as_index", [False, True])
|
||||
def test_column_name_clashes(test, expected_names, as_index):
|
||||
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
|
||||
if test == "repeat":
|
||||
df.columns = list("abbde")
|
||||
else:
|
||||
df.columns = list("abcd") + ["level_1"]
|
||||
|
||||
if as_index:
|
||||
result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
|
||||
expected = Series(
|
||||
data=(1, 1),
|
||||
index=MultiIndex.from_tuples(
|
||||
[(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
|
||||
names=expected_names,
|
||||
),
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="cannot insert"):
|
||||
df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
|
||||
|
||||
|
||||
def test_ambiguous_grouping():
|
||||
# Test that groupby is not confused by groupings length equal to row count
|
||||
df = DataFrame({"a": [1, 1]})
|
||||
gb = df.groupby([1, 1])
|
||||
result = gb.value_counts()
|
||||
expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
|
||||
tm.assert_series_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,372 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dropna, tuples, outputs",
|
||||
[
|
||||
(
|
||||
True,
|
||||
[["A", "B"], ["B", "A"]],
|
||||
{"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
|
||||
),
|
||||
(
|
||||
False,
|
||||
[["A", "B"], ["A", np.nan], ["B", "A"]],
|
||||
{
|
||||
"c": [13.0, 12.3, 123.23],
|
||||
"d": [13.0, 233.0, 123.0],
|
||||
"e": [13.0, 12.0, 1.0],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
|
||||
dropna, tuples, outputs, nulls_fixture
|
||||
):
|
||||
# GH 3729 this is to test that NA is in one group
|
||||
df_list = [
|
||||
["A", "B", 12, 12, 12],
|
||||
["A", nulls_fixture, 12.3, 233.0, 12],
|
||||
["B", "A", 123.23, 123, 1],
|
||||
["A", "B", 1, 1, 1.0],
|
||||
]
|
||||
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
|
||||
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
|
||||
|
||||
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
|
||||
|
||||
# Since right now, by default MI will drop NA from levels when we create MI
|
||||
# via `from_*`, so we need to add NA for level manually afterwards.
|
||||
if not dropna:
|
||||
mi = mi.set_levels(["A", "B", np.nan], level="b")
|
||||
expected = pd.DataFrame(outputs, index=mi)
|
||||
|
||||
tm.assert_frame_equal(grouped, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dropna, tuples, outputs",
|
||||
[
|
||||
(
|
||||
True,
|
||||
[["A", "B"], ["B", "A"]],
|
||||
{"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
|
||||
),
|
||||
(
|
||||
False,
|
||||
[["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
|
||||
{
|
||||
"c": [12.0, 13.3, 123.23, 1.0],
|
||||
"d": [12.0, 234.0, 123.0, 1.0],
|
||||
"e": [12.0, 13.0, 1.0, 1.0],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
|
||||
dropna, tuples, outputs, nulls_fixture, nulls_fixture2
|
||||
):
|
||||
# GH 3729 this is to test that NA in different groups with different representations
|
||||
df_list = [
|
||||
["A", "B", 12, 12, 12],
|
||||
["A", nulls_fixture, 12.3, 233.0, 12],
|
||||
["B", "A", 123.23, 123, 1],
|
||||
[nulls_fixture2, "B", 1, 1, 1.0],
|
||||
["A", nulls_fixture2, 1, 1, 1.0],
|
||||
]
|
||||
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
|
||||
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
|
||||
|
||||
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
|
||||
|
||||
# Since right now, by default MI will drop NA from levels when we create MI
|
||||
# via `from_*`, so we need to add NA for level manually afterwards.
|
||||
if not dropna:
|
||||
mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
|
||||
expected = pd.DataFrame(outputs, index=mi)
|
||||
|
||||
tm.assert_frame_equal(grouped, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    """Single-key groupby sum with and without the null group (GH 3729)."""
    rows = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d"])
    summed = frame.groupby("a", dropna=dropna).sum()

    expected_index = pd.Index(idx, dtype="object", name="a")
    expected = pd.DataFrame(outputs, index=expected_index)

    tm.assert_frame_equal(summed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    """Grouping a Series by index level honours ``dropna``."""
    values = pd.Series([1, 2, 3, 3], index=idx)
    by_level = values.groupby(level=0, dropna=dropna)

    tm.assert_series_equal(by_level.sum(), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    """Grouping a Series by an explicit key list honours ``dropna``."""
    birds = ["Falcon", "Falcon", "Parrot", "Parrot"]
    speeds = pd.Series([390.0, 350.0, 30.0, 20.0], index=birds, name="Max Speed")
    # The last key is NaN, so the final row forms the null group.
    keys = ["a", "b", "a", np.nan]

    means = speeds.groupby(keys, dropna=dropna).mean()
    tm.assert_series_equal(means, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
    """The ``dropna`` flag given to groupby is stored on its grouper (GH 36604)."""
    frame = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    grouped = frame.groupby("A", dropna=dropna)
    assert grouped.grouper.dropna == dropna
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna,input_index,expected_data,expected_index",
    [
        (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
        (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
        (
            True,
            pd.MultiIndex.from_tuples(
                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
            ),
            {"B": [2, 2, 1]},
            pd.MultiIndex.from_tuples(
                [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
            ),
        ),
        (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
        (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
        (
            False,
            pd.MultiIndex.from_tuples(
                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
            ),
            {"B": [2, 2, 1, 1]},
            pd.MultiIndex.from_tuples(
                [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
            ),
        ),
    ],
)
def test_groupby_dataframe_slice_then_transform(
    dropna, input_index, expected_data, expected_index
):
    """``transform(len)`` on the groupby, a list slice and a scalar slice."""
    # GH35014 & GH35612
    frame = pd.DataFrame(
        {"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index
    )
    grouped = frame.groupby("A", dropna=dropna)

    # Whole-groupby transform.
    whole = grouped.transform(len)
    tm.assert_frame_equal(whole, pd.DataFrame(expected_data, index=expected_index))

    # List selection keeps a frame-shaped result.
    as_frame = grouped[["B"]].transform(len)
    tm.assert_frame_equal(as_frame, pd.DataFrame(expected_data, index=expected_index))

    # Scalar selection gives a Series named after the column.
    as_series = grouped["B"].transform(len)
    tm.assert_series_equal(
        as_series, pd.Series(expected_data["B"], index=expected_index, name="B")
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    """Aggregate with a per-column dict over two keys, one of which has a null."""
    # GH 3729
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d", "e"])
    aggregations = {"c": sum, "d": max, "e": "min"}
    result = frame.groupby(["a", "b"], dropna=dropna).agg(aggregations)

    expected_index = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # Since right now, by default MI will drop NA from levels when we create MI
    # via `from_*`, so we need to add NA for level manually afterwards.
    if not dropna:
        expected_index = expected_index.set_levels(["A", "B", np.nan], level="b")

    tm.assert_frame_equal(result, pd.DataFrame(outputs, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    """Sum values grouped by a datetime-like key column that holds two nulls."""
    # 3729
    key_column = [
        datetime1,
        unique_nulls_fixture,
        datetime2,
        unique_nulls_fixture2,
        datetime1,
        datetime1,
    ]
    frame = pd.DataFrame({"values": [1, 2, 3, 4, 5, 6], "dt": key_column})

    # With dropna=False the null rows form one extra group, labelled NaN.
    group_labels = [datetime1, datetime2]
    if not dropna:
        group_labels = [datetime1, datetime2, np.nan]

    result = frame.groupby("dt", dropna=dropna).agg({"values": sum})
    expected = pd.DataFrame(
        {"values": values}, index=pd.Index(group_labels, name="dt")
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    """``apply`` returning a frame per group yields a (group, position) index."""
    # GH 35889

    frame = pd.DataFrame(data)
    grouped = frame.groupby("groups", dropna=dropna)
    result = grouped.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))

    index_pairs = tuple(zip(data["groups"], selected_data["values"]))
    expected_index = pd.MultiIndex.from_tuples(index_pairs, names=["groups", None])
    # Since right now, by default MI will drop NA from levels when we create MI
    # via `from_*`, so we need to add NA for level manually afterwards.
    if levels and not dropna:
        expected_index = expected_index.set_levels(levels, level="groups")

    tm.assert_frame_equal(result, pd.DataFrame(selected_data, index=expected_index))
|
||||
|
||||
|
||||
def test_groupby_nan_included():
    """``indices`` exposes the NaN group last when ``dropna=False``."""
    # GH 35646
    frame = pd.DataFrame(
        {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    )
    indices = frame.groupby("group", dropna=False).indices

    expected = {
        "g1": np.array([0, 2], dtype=np.intp),
        "g2": np.array([3], dtype=np.intp),
        np.nan: np.array([1, 4], dtype=np.intp),
    }
    # Compare positionally: NaN keys cannot be looked up by equality.
    for actual_rows, expected_rows in zip(indices.values(), expected.values()):
        tm.assert_numpy_array_equal(actual_rows, expected_rows)

    keys = list(indices.keys())
    assert np.isnan(keys[2])
    assert keys[0:2] == ["g1", "g2"]
|
||||
@@ -0,0 +1,133 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_group_shift_with_null_key():
    """Grouped ``shift(-1)`` when part of the group key is missing."""
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    records = [(i % 12, (i % 3) or np.nan, i) for i in range(n_rows)]
    frame = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)
    result = frame.groupby(["A", "B"]).shift(-1)

    # Rows with a null key, and the last 12 rows, have no successor.
    shifted_z = [
        i + 12 if (i % 3 and i < n_rows - 12) else np.nan for i in range(n_rows)
    ]
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_with_fill_value():
    """Grouped ``shift(-1, fill_value=0)`` fills the vacated slot with 0."""
    # GH #24128
    n_rows = 24
    period = 12
    records = [(i % period, i % 3, i) for i in range(n_rows)]
    frame = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)
    result = frame.groupby(["A", "B"]).shift(-1, fill_value=0)

    # The first 12 rows pick up the value 12 rows later; the rest get the fill.
    shifted_z = [i + period for i in range(n_rows - period)] + [0] * period
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_lose_timezone():
    """A grouped ``shift(0)`` must keep a tz-aware column tz-aware."""
    # GH 30134
    # Timestamp.now(tz="UTC") is the non-deprecated spelling of
    # Timestamp.utcnow(); both give the current time as a UTC-aware Timestamp.
    now_dt = Timestamp.now(tz="UTC")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    # Reuse the result's own name so only values/dtype are being compared.
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_real(any_real_numpy_dtype):
    """Grouped ``diff`` on real dtypes returns a float result."""
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    result = frame.groupby("a")["b"].diff()

    # Narrow inputs are expected to come back as float32, the rest as float.
    narrow = any_real_numpy_dtype in ["int8", "int16", "float32"]
    exp_dtype = "float32" if narrow else "float"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data):
    """Grouped ``diff`` of datetime-like values yields timedeltas."""
    frame = DataFrame({"a": [1, 2, 2], "b": data})
    diffs = frame.groupby("a")["b"].diff()
    one_day_apart = Series([NaT, NaT, Timedelta("1 days")], name="b")
    tm.assert_series_equal(diffs, one_day_apart)
|
||||
|
||||
|
||||
def test_group_diff_bool():
    """Grouped ``diff`` on a boolean column."""
    keys = [1, 2, 3, 3, 2]
    flags = [True, True, False, False, True]
    frame = DataFrame({"a": keys, "b": flags})

    diffs = frame.groupby("a")["b"].diff()

    tm.assert_series_equal(
        diffs, Series([np.nan, np.nan, np.nan, False, False], name="b")
    )
|
||||
|
||||
|
||||
def test_group_diff_object_raises(object_dtype):
    """Grouped ``diff`` on string data raises ``TypeError``."""
    string_data = {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}
    frame = DataFrame(string_data, dtype=object_dtype)
    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
        frame.groupby("a")["b"].diff()
|
||||
|
||||
|
||||
def test_empty_shift_with_fill():
    """On an empty frame, ``shift`` gives the same result with or without a fill."""
    # GH 41264, single-index check
    empty = DataFrame(columns=["a", "b", "c"])
    plain = empty.groupby(["a"]).shift(1)
    filled = empty.groupby(["a"]).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
|
||||
|
||||
def test_multindex_empty_shift_with_fill():
    """Two-key variant of the empty-frame ``shift`` comparison."""
    # GH 41264, multi-index check
    empty = DataFrame(columns=["a", "b", "c"])
    keys = ["a", "b"]
    plain = empty.groupby(keys).shift(1)
    filled = empty.groupby(keys).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
@@ -0,0 +1,113 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.groupby.base import maybe_normalize_deprecated_kernels
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_groupby_preserves_subclass(obj, groupby_func):
    """Each groupby kernel returns the subclass and matches ``.agg(name)``."""
    # GH28330 -- preserve subclass through groupby operations

    if isinstance(obj, Series) and groupby_func in {"corrwith"}:
        pytest.skip("Not applicable")
    # TODO(2.0) Remove after pad/backfill deprecation enforced
    kernel = maybe_normalize_deprecated_kernels(groupby_func)
    grouped = obj.groupby(np.arange(0, 10))

    # Groups should preserve subclass type
    assert isinstance(grouped.get_group(0), type(obj))

    # Positional arguments that particular kernels require.
    if kernel in {"fillna", "nth"}:
        extra_args = [0]
    elif kernel == "corrwith":
        extra_args = [obj]
    elif kernel == "tshift":
        extra_args = [0, 0]
    else:
        extra_args = []

    direct = getattr(grouped, kernel)(*extra_args)
    via_agg = grouped.agg(kernel, *extra_args)

    # Reduction or transformation kernels should preserve type
    slices = {"ngroup", "cumcount", "size"}
    collapses_to_series = isinstance(obj, DataFrame) and kernel in slices
    expected_type = tm.SubclassedSeries if collapses_to_series else type(obj)
    assert isinstance(direct, expected_type)

    # Confirm .agg() groupby operations return same results
    if isinstance(direct, DataFrame):
        tm.assert_frame_equal(direct, via_agg)
    else:
        tm.assert_series_equal(direct, via_agg)
|
||||
|
||||
|
||||
def test_groupby_preserves_metadata():
    """Custom ``_metadata`` attributes survive iteration, ``apply`` and ``agg``."""
    # GH-37343
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    assert "testattr" in custom_df._metadata
    custom_df.testattr = "hello"
    for _key, piece in custom_df.groupby("c"):
        assert piece.testattr == "hello"

    # GH-45314
    def frame_attr(group):
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        return group.testattr

    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
    tm.assert_series_equal(custom_df.groupby("c").apply(frame_attr), expected)

    def series_attr(group):
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        return group.testattr

    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"
    applied = custom_series.groupby(custom_df["c"]).apply(series_attr)
    tm.assert_series_equal(applied, expected)
    aggregated = custom_series.groupby(custom_df["c"]).agg(series_attr)
    tm.assert_series_equal(aggregated, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
    """``groupby(...).resample(...)`` returns the caller's frame type."""
    # GH28330 -- preserve subclass through groupby.resample()

    purchases = {
        "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
        "Quantity": [18, 3, 5, 1, 9, 3],
        "Date": [
            datetime(2013, 9, 1, 13, 0),
            datetime(2013, 9, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 3, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 9, 2, 14, 0),
        ],
    }
    frame = obj(purchases).set_index("Date")

    # Confirm groupby.resample() preserves dataframe type
    resampled = frame.groupby("Buyer").resample("5D").sum()
    assert isinstance(resampled, obj)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,82 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    """Six-row frame whose index is built from the parametrized columns."""
    index_columns = request.param
    table = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    # An empty parameter leaves the default index in place.
    return table.set_index(index_columns) if index_columns else table
|
||||
|
||||
|
||||
@pytest.fixture()
def series():
    """Column ``A`` as a Series indexed by (``outer``, ``inner``, ``B``)."""
    table = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    indexed = table.set_index(["outer", "inner", "B"])
    return indexed["A"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    """Grouping by level name matches grouping by ``pd.Grouper(level=...)``."""
    by_name = frame.groupby(key_strs).mean()
    by_grouper = frame.groupby(groupers).mean()
    tm.assert_frame_equal(by_name, by_grouper)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    """Series variant: level names and explicit Groupers give equal means."""

    # Compute expected result
    groupers = (
        [pd.Grouper(level=name) for name in levels]
        if isinstance(levels, list)
        else pd.Grouper(level=levels)
    )
    expected = series.groupby(groupers).mean()

    # Compute and check result
    tm.assert_series_equal(series.groupby(levels).mean(), expected)
|
||||
@@ -0,0 +1,316 @@
|
||||
# Test GroupBy._positional_selector positional grouped indexing GH#42864
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [2, [5]],
        [5, []],
        [-1, [3, 4, 7]],
        [-2, [1, 6]],
        [-6, []],
    ],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Selecting one integer position per group."""
    # Test single integer
    picked = slice_test_grouped._positional_selector[arg]
    tm.assert_frame_equal(picked, slice_test_df.iloc[expected_rows])
|
||||
|
||||
|
||||
def test_slice(slice_test_df, slice_test_grouped):
    """Selecting with one stepped slice per group."""
    # Test single slice
    every_other = slice(0, 3, 2)
    picked = slice_test_grouped._positional_selector[every_other]
    tm.assert_frame_equal(picked, slice_test_df.iloc[[0, 1, 4, 5]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [[0, 2], [0, 1, 4, 5]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [range(0, 3, 2), [0, 1, 4, 5]],
        [{0, 2}, [0, 1, 4, 5]],
    ],
    ids=[
        "list",
        "negative",
        "range",
        "set",
    ],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Selecting with a list, range or set of integer positions."""
    # Test lists of integers and integer valued iterables
    picked = slice_test_grouped._positional_selector[arg]
    tm.assert_frame_equal(picked, slice_test_df.iloc[expected_rows])
|
||||
|
||||
|
||||
def test_ints(slice_test_df, slice_test_grouped):
    """Selecting with several integers passed as one subscript tuple."""
    # Test tuple of ints
    positions = (0, 2, -1)
    picked = slice_test_grouped._positional_selector[positions]
    tm.assert_frame_equal(picked, slice_test_df.iloc[[0, 1, 3, 4, 5, 7]])
|
||||
|
||||
|
||||
def test_slices(slice_test_df, slice_test_grouped):
    """Selecting with a leading slice and a trailing slice together."""
    # Test tuple of slices
    first_two, last_two = slice(None, 2), slice(-2, None)
    picked = slice_test_grouped._positional_selector[first_two, last_two]
    tm.assert_frame_equal(picked, slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]])
|
||||
|
||||
|
||||
def test_mix(slice_test_df, slice_test_grouped):
    """Selecting with integers and a slice in the same subscript."""
    # Test mixed tuple of ints and slices
    mixed_key = (0, 1, slice(-2, None))
    picked = slice_test_grouped._positional_selector[mixed_key]
    tm.assert_frame_equal(picked, slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_as_index(slice_test_df, arg, expected_rows):
    """Positional selection on a groupby created without ``as_index=False``."""
    # Test the default as_index behaviour
    grouped = slice_test_df.groupby("Group", sort=False)
    picked = grouped._positional_selector[arg]
    tm.assert_frame_equal(picked, slice_test_df.iloc[expected_rows])
|
||||
|
||||
|
||||
def test_doc_examples():
    """Run the two positional-selector examples from the documentation."""
    # Test the examples in the documentation
    columns = ["A", "B"]
    frame = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=columns
    )
    grouped = frame.groupby("A", as_index=False)

    # (subscript, expected rows, expected index labels)
    examples = [
        (slice(1, 2), [["a", 2], ["b", 5]], [1, 4]),
        ((1, -1), [["a", 2], ["a", 3], ["b", 5]], [1, 2, 4]),
    ]
    for key, rows, labels in examples:
        result = grouped._positional_selector[key]
        expected = pd.DataFrame(rows, columns=columns, index=labels)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.fixture()
def multiindex_data():
    """
    Build per-date lists of ``(item, A, B)`` tuples for the MultiIndex test.

    Returns
    -------
    dict
        Maps each of 100 daily dates to a list of 8 to 20 tuples, sorted by
        their second element.
    """
    # A locally seeded generator keeps the data identical from run to run
    # and leaves the global ``random`` state untouched.
    rng = random.Random(2)

    ndates = 100
    nitems = 20
    dates = pd.date_range("20130101", periods=ndates, freq="D")
    items = [f"item {i}" for i in range(nitems)]

    data = {}
    for date in dates:
        # Drop between 0 and 12 trailing items so group sizes vary.
        nitems_for_date = nitems - rng.randint(0, 12)
        levels = [
            (item, rng.randint(0, 10000) / 100, rng.randint(0, 10000) / 100)
            for item in items[:nitems_for_date]
        ]
        levels.sort(key=lambda x: x[1])
        data[date] = levels

    return data
|
||||
|
||||
|
||||
def _make_df_from_data(data):
|
||||
rows = {}
|
||||
for date in data:
|
||||
for level in data[date]:
|
||||
rows[(date, level[0])] = {"A": level[1], "B": level[2]}
|
||||
|
||||
df = pd.DataFrame.from_dict(rows, orient="index")
|
||||
df.index.names = ("Date", "Item")
|
||||
return df
|
||||
|
||||
|
||||
def test_multiindex(multiindex_data):
    """``nth(slice(3, -3))`` per date equals slicing each date's list first."""
    # Test the multiindex mentioned as the use-case in the documentation
    full = _make_df_from_data(multiindex_data)
    result = full.groupby("Date", as_index=False).nth(slice(3, -3))

    trimmed = {date: levels[3:-3] for date, levels in multiindex_data.items()}
    expected = _make_df_from_data(trimmed)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
    """Positional slices agree with grouped ``head``/``tail`` or a hand-built index."""
    # Test gives the same results as grouped head and tail
    n_groups = 100
    n_rows_per_group = 30
    total_rows = n_groups * n_rows_per_group

    # Rows are interleaved: row j of every group comes before row j + 1 of any.
    group_labels = [f"group {g}" for g in range(n_groups)] * n_rows_per_group
    value_labels = [
        f"group {g} row {j}" for j in range(n_rows_per_group) for g in range(n_groups)
    ]
    df = pd.DataFrame({"group": group_labels, "value": value_labels})
    grouped = df.groupby("group", as_index=False)
    size = arg if arg >= 0 else n_rows_per_group + arg

    if method == "head":
        result = grouped._positional_selector[:arg]

        if not simulated:
            expected = grouped.head(arg)
        else:
            positions = [
                j * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if j * n_groups + i < total_rows
            ]
            expected = df.iloc[positions]

    else:
        result = grouped._positional_selector[-arg:]

        if not simulated:
            expected = grouped.tail(arg)
        else:
            first_kept = n_rows_per_group - size
            positions = [
                (first_kept + j) * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if (first_kept + j) * n_groups + i >= 0
            ]
            expected = df.iloc[positions]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
    """With a single group, positional slicing equals ``DataFrame.iloc``."""
    # Test that a single group gives the same results as DataFrame.iloc
    n_rows = 30
    frame = pd.DataFrame(
        {"group": ["group 0"] * n_rows, "value": list(range(n_rows))}
    )
    grouped = frame.groupby("group", as_index=False)

    window = slice(start, stop, step)
    tm.assert_frame_equal(grouped._positional_selector[window], frame.iloc[window])
|
||||
|
||||
|
||||
def test_series():
    """Positional slicing of a Series grouped by its index."""
    # Test grouped Series
    labels = ["a", "a", "a", "b", "b"]
    values = pd.Series([1, 2, 3, 4, 5], index=labels)
    second_of_each = values.groupby(level=0)._positional_selector[1:2]

    tm.assert_series_equal(second_of_each, pd.Series([2, 5], index=["a", "b"]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
    """``[::step]`` keeps every ``step``-th row within each group."""
    # Test slice with various step values
    group_sizes = [("x", 5), ("y", 4), ("z", 3)]
    rows = [[key, f"{key}{i}"] for key, size in group_sizes for i in range(size)]
    frame = pd.DataFrame(rows, columns=["A", "B"])

    grouped = frame.groupby("A", as_index=False)
    result = grouped._positional_selector[::step]

    # Groups are contiguous, so each one starts at the running row offset.
    kept_rows = []
    kept_labels = []
    offset = 0
    for key, size in group_sizes:
        for i in range(0, size, step):
            kept_rows.append([key, f"{key}{i}"])
            kept_labels.append(offset + i)
        offset += size

    expected = pd.DataFrame(kept_rows, columns=["A", "B"], index=kept_labels)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.fixture()
def column_group_df():
    """Two-row frame with columns ``A``..``G``; the second row holds group keys."""
    first_row = [0, 1, 2, 3, 4, 5, 6]
    second_row = [0, 0, 1, 0, 1, 0, 2]
    return pd.DataFrame([first_row, second_row], columns=list("ABCDEFG"))
|
||||
|
||||
|
||||
def test_column_axis(column_group_df):
    """Positional selection when grouping along the column axis."""
    column_keys = column_group_df.iloc[1]
    by_column = column_group_df.groupby(column_keys, axis=1)
    picked = by_column._positional_selector[1:-1]

    tm.assert_frame_equal(picked, column_group_df.iloc[:, [1, 3]])
|
||||
|
||||
|
||||
def test_columns_on_iter():
    """Iterating a column-sliced groupby yields only the selected columns."""
    # GitHub issue #44821
    frame = pd.DataFrame({name: range(10) for name in "ABC"})

    # Group-by and select columns
    cols = ["A", "B"]
    sliced = frame.groupby(frame.A < 4)[cols]
    for _key, piece in sliced:
        tm.assert_index_equal(piece.columns, pd.Index(cols))
        assert "C" not in piece.columns
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
    """Selecting a column twice produces that column twice in the result."""
    # GH#44924
    frame = pd.DataFrame({"A": [1, 2], "B": [3, 3], "C": ["G", "G"]})
    selection = func(["A", "B", "A"])
    result = frame.groupby("C")[selection].mean()

    expected = pd.DataFrame(
        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
    )
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,283 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import groupby as libgroupby
|
||||
from pandas._libs.groupby import (
|
||||
group_cumprod_float64,
|
||||
group_cumsum,
|
||||
group_mean,
|
||||
group_var,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import ensure_platform_int
|
||||
|
||||
from pandas import isna
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class GroupVarTestMixin:
    """
    Shared checks for a grouped-variance kernel.

    Subclasses provide ``algo`` (the kernel under test), ``dtype`` (dtype of
    the value and output buffers) and ``rtol`` (tolerance for ``np.allclose``).
    """

    def test_group_var_generic_1d(self):
        """Five groups of three values in one column."""
        rng = np.random.RandomState(1234)

        out = np.full((5, 1), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(15, 1).astype(self.dtype)
        labels = np.tile(np.arange(5), (3,)).astype("intp")

        # Labels cycle 0..4, so a column-major reshape puts each group on a row.
        by_group = np.squeeze(values).reshape((5, 3), order="F")
        expected_out = (by_group.std(axis=1, ddof=1) ** 2)[:, np.newaxis]
        expected_counts = np.full(5, 3, dtype="int64")

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        """One group holding all five values."""
        rng = np.random.RandomState(1234)

        out = np.full((1, 1), np.nan).astype(self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * rng.rand(5, 1).astype(self.dtype)
        labels = np.zeros(5, dtype="intp")

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = np.full(1, 5, dtype="int64")

        self.algo(out, counts, values, labels)

        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        """Five groups of two rows across two finite columns."""
        rng = np.random.RandomState(1234)

        out = np.full((5, 2), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = np.full(5, 2, dtype="int64")

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        """Same layout, but the second column is entirely NaN."""
        rng = np.random.RandomState(1234)

        out = np.full((5, 2), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        finite_column = (
            values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2
        )
        nan_column = np.full(5, np.nan)
        expected_out = np.vstack([finite_column, nan_column]).T.astype(self.dtype)
        expected_counts = np.full(5, 2, dtype="int64")

        self.algo(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        """Three identical values give a non-negative variance close to zero."""
        # Regression test from GH 10448.

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
        labels = np.zeros(3, dtype="intp")

        self.algo(out, counts, values, labels)

        variance = out[0, 0]
        assert counts[0] == 3
        assert variance >= 0
        tm.assert_almost_equal(variance, 0.0)
|
||||
|
||||
|
||||
class TestGroupVarFloat64(GroupVarTestMixin):
    """Run the shared variance checks against ``group_var`` with float64."""

    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):
        """A million values offset by 1e12 still give a variance near 1/12."""
        n_values = 10**6
        rng = np.random.RandomState(1234)

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = (rng.rand(n_values) + 10**12).astype(self.dtype)
        values.shape = (n_values, 1)
        labels = np.zeros(n_values, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == n_values
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
|
||||
|
||||
|
||||
class TestGroupVarFloat32(GroupVarTestMixin):
    """Run the shared variance checks with float32 and a looser tolerance."""

    __test__ = True

    # Same kernel as the float64 case; only dtype and tolerance differ.
    algo = staticmethod(group_var)
    dtype = np.float32
    rtol = 1e-2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
    """Check ``group_ohlc`` against open/high/low/close computed with NumPy."""
    obj = np.array(np.random.randn(20), dtype=dtype)

    # Three consecutive groups ending at positions 6, 12 and 20.
    bins = np.array([6, 12, 20])
    group_sizes = np.diff(np.r_[0, bins])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    labels = ensure_platform_int(np.repeat(np.arange(3), group_sizes))

    kernel = libgroupby.group_ohlc
    kernel(out, counts, obj[:, None], labels)

    def _ohlc(group):
        if isna(group).all():
            return np.repeat(np.nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    segments = (obj[:6], obj[6:12], obj[12:])
    expected = np.array([_ohlc(segment) for segment in segments])

    tm.assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

    # Blank out the first group and rerun the kernel on the same buffers.
    obj[:6] = np.nan
    kernel(out, counts, obj[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)
|
||||
|
||||
|
||||
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
    """
    Check a group transform that executes a cumulative function.

    Parameters
    ----------
    pd_op : callable
        The pandas cumulative function.
    np_op : callable
        The analogous one in NumPy.
    dtype : type
        The specified dtype of the data.
    """
    is_datetimelike = False

    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    answer = np.zeros_like(data)

    # all four rows belong to the single group 0
    labels = np.array([0, 0, 0, 0], dtype=np.intp)
    ngroups = 1
    # pd_op is expected to write its result into ``answer`` in place
    pd_op(answer, data, labels, ngroups, is_datetimelike)

    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
|
||||
|
||||
|
||||
def test_cython_group_transform_cumsum(any_real_numpy_dtype):
    """Check ``group_cumsum`` against ``np.cumsum`` for each real NumPy dtype."""
    # see gh-4095
    dtype = np.dtype(any_real_numpy_dtype).type
    pd_op, np_op = group_cumsum, np.cumsum
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
|
||||
|
||||
|
||||
def test_cython_group_transform_cumprod():
    """Check ``group_cumprod_float64`` against ``np.cumprod`` on float64 data."""
    # see gh-4095
    dtype = np.float64
    # np.cumprod instead of the alias np.cumproduct, which NumPy 2.0 removed
    pd_op, np_op = group_cumprod_float64, np.cumprod
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
|
||||
|
||||
|
||||
def test_cython_group_transform_algos():
    """Check the cumulative kernels on data with NaN and on timedelta data."""
    # see gh-4095
    is_datetimelike = False

    # with nans
    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
    ngroups = 1

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
    # the NaN row stays NaN and is skipped by the running product
    expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumsum(actual, data, labels, ngroups, is_datetimelike)
    # same pattern for the running sum
    expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    # timedelta
    is_datetimelike = True
    data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
    actual = np.zeros_like(data, dtype="int64")
    # the kernel is fed the int64 view of the timedelta values
    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
    expected = np.array(
        [
            np.timedelta64(1, "ns"),
            np.timedelta64(2, "ns"),
            np.timedelta64(3, "ns"),
            np.timedelta64(4, "ns"),
            np.timedelta64(5, "ns"),
        ]
    )
    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
|
||||
|
||||
|
||||
def test_cython_group_mean_datetimelike():
    """Check that ``group_mean`` yields 3 for timedeltas 2ns, 4ns and NaT."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    # timedelta values are passed as the float64 cast of their int64 view
    data = (
        np.array(
            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=True)

    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
|
||||
|
||||
|
||||
def test_cython_group_mean_wrong_min_count():
    """Check that ``group_mean`` raises AssertionError for ``min_count=0``."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    data = np.zeros(1, dtype="float64")[:, None]
    labels = np.zeros(1, dtype=np.intp)

    with pytest.raises(AssertionError, match="min_count"):
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
|
||||
|
||||
|
||||
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    """Check ``group_mean`` averages NaT-valued inputs as plain numbers when
    ``is_datetimelike=False``."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    # two NaT values, passed as the float64 cast of their int64 view
    data = (
        np.array(
            [np.timedelta64("NaT"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    # expected value is the ordinary arithmetic mean of the two inputs
    tm.assert_numpy_array_equal(
        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
    )
|
||||
@@ -0,0 +1,227 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import iNaT
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.api import Int64Index
|
||||
|
||||
|
||||
def test_max_min_non_numeric():
    """Check that the string column "ss" survives groupby max/min."""
    # #2700
    aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})

    result = aa.groupby("nn").max()
    assert "ss" in result

    result = aa.groupby("nn").max(numeric_only=False)
    assert "ss" in result

    result = aa.groupby("nn").min()
    assert "ss" in result

    result = aa.groupby("nn").min(numeric_only=False)
    assert "ss" in result
|
||||
|
||||
|
||||
def test_max_min_object_multiple_columns(using_array_manager):
    """Check max/min when only some object columns support the aggregation."""
    # GH#41111 case where the aggregation is valid for some columns but not
    # others; we split object blocks column-wise, consistent with
    # DataFrame._reduce

    df = DataFrame(
        {
            "A": [1, 1, 2, 2, 3],
            "B": [1, "foo", 2, "bar", False],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    df._consolidate_inplace()  # should already be consolidate, but double-check
    if not using_array_manager:
        assert len(df._mgr.blocks) == 2

    gb = df.groupby("A")

    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.max(numeric_only=False)
    # "max" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
    tm.assert_frame_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.min(numeric_only=False)
    # "min" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_min_date_with_nans():
    """Check groupby min on a column of ``datetime.date`` objects."""
    # GH26321
    dates = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
    ).dt.date
    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})

    result = df.groupby("b", as_index=False)["c"].min()["c"]
    expected = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
    ).dt.date
    tm.assert_series_equal(result, expected)

    # with the group key as the index, only the index name differs
    result = df.groupby("b")["c"].min()
    expected.index.name = "b"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_max_inat():
    """Check that an int64 value equal to iNaT is not treated as missing."""
    # GH#40767 dont interpret iNaT as NaN
    ser = Series([1, iNaT])
    gb = ser.groupby([1, 1])

    result = gb.max(min_count=2)
    expected = Series({1: 1}, dtype=np.int64)
    tm.assert_series_equal(result, expected, check_exact=True)

    result = gb.min(min_count=2)
    expected = Series({1: iNaT}, dtype=np.int64)
    tm.assert_series_equal(result, expected, check_exact=True)

    # not enough entries -> gets masked to NaN
    result = gb.min(min_count=3)
    expected = Series({1: np.nan})
    tm.assert_series_equal(result, expected, check_exact=True)
|
||||
|
||||
|
||||
def test_max_inat_not_all_na():
    """Check min with ``min_count=2`` when some groups are too small."""
    # GH#40767 dont interpret iNaT as NaN

    # make sure we dont round iNaT+1 to iNaT
    ser = Series([1, iNaT, 2, iNaT + 1])
    gb = ser.groupby([1, 2, 3, 3])
    result = gb.min(min_count=2)

    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
    tm.assert_series_equal(result, expected, check_exact=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_column(func):
    """Check min/max on a Period column selected from a groupby."""
    # GH 31471
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a")["b"], func)()
    # plain Index infers int64 from the data; Int64Index is gone in pandas 2.0
    idx = Index([1, 2], name="a")
    expected = Series(periods, index=idx, name="b")

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_frame(func):
    """Check min/max on a frame-level groupby that holds a Period column."""
    # GH 31471
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a"), func)()
    # plain Index infers int64 from the data; Int64Index is gone in pandas 2.0
    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": periods}, index=idx)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_numeric_object_dtype():
    """Check groupby min on object-dtype frames holding NaN or numbers."""
    # https://github.com/pandas-dev/pandas/issues/39329
    # simplified case: multiple object columns where one is all-NaN
    # -> gets split as the all-NaN is inferred as float
    df = DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
    ).astype(object)
    result = df.groupby("key").min()
    expected = DataFrame(
        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
    ).set_index("key")
    tm.assert_frame_equal(result, expected)

    # same but with numbers
    df = DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
    ).astype(object)
    result = df.groupby("key").min()
    expected = DataFrame(
        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
    ).set_index("key")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["min", "max"])
def test_aggregate_categorical_lost_index(func: str):
    """Check that agg min/max on an ordered categorical keeps the group index."""
    # GH: 28641 groupby drops index, when grouping over categorical column with min/max
    ds = Series(["b"], dtype="category").cat.as_ordered()
    df = DataFrame({"A": [1997], "B": ds})
    result = df.groupby("A").agg({"B": func})
    expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))

    # ordered categorical dtype should be preserved
    expected["B"] = expected["B"].astype(ds.dtype)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
def test_groupby_min_max_nullable(dtype):
    """Check groupby min/max, with and without ``min_count``, on nullable dtypes."""
    if dtype == "Int64":
        # GH#41743 avoid precision loss
        ts = 1618556707013635762
    elif dtype == "boolean":
        ts = 0
    else:
        ts = 4.0

    df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
    df["ts"] = df["ts"].astype(dtype)

    gb = df.groupby("id")

    # the single group holds rows (ts, ts + 1): min is row 0, max is row 1
    result = gb.min()
    expected = df.iloc[:1].set_index("id")
    tm.assert_frame_equal(result, expected)

    res_max = gb.max()
    expected_max = df.iloc[1:].set_index("id")
    tm.assert_frame_equal(res_max, expected_max)

    # fewer than min_count observations -> pd.NA in the same dtype
    result2 = gb.min(min_count=3)
    expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
    tm.assert_frame_equal(result2, expected2)

    res_max2 = gb.max(min_count=3)
    tm.assert_frame_equal(res_max2, expected2)

    # Case with NA values
    df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
    df2["ts"] = df2["ts"].astype(dtype)
    gb2 = df2.groupby("id")

    result3 = gb2.min()
    tm.assert_frame_equal(result3, expected)

    res_max3 = gb2.max()
    tm.assert_frame_equal(res_max3, expected_max)

    result4 = gb2.min(min_count=100)
    tm.assert_frame_equal(result4, expected2)

    res_max4 = gb2.max(min_count=100)
    tm.assert_frame_equal(res_max4, expected2)
|
||||
@@ -0,0 +1,155 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    """Check that ffill/bfill keep the name of the columns Index."""
    # GH: 29764 groupby loses index sometimes
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=Index(["type", "a", "b"], name="idx"),
    )
    df_grouped = df.groupby(["type"])[["a", "b"]]
    result = getattr(df_grouped, func)().columns
    expected = Index(["a", "b"], name="idx")
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    """Check ffill/bfill on a frame that has two columns named "field1"."""
    # GH: 25610 ValueError with duplicate column names
    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    df2 = DataFrame({"field1": [1, np.nan, 4]})
    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    result = getattr(df_grouped, func)()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_ffill_missing_arguments():
    """Check that groupby ``fillna()`` without arguments raises ValueError."""
    # GH 14955
    df = DataFrame({"a": [1, 2], "b": [1, 1]})
    with pytest.raises(ValueError, match="Must specify a fill"):
        df.groupby("b").fillna()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
    """Check groupby ``fillna(method=...)`` on a "string"-dtype column."""
    # GH 40250
    df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
    grp = df.groupby("b")
    result = grp.fillna(method=method)
    expected = DataFrame({"a": pd.array(expected, dtype="string")})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_fill_consistency():
    """Check that ffill along axis 0 equals ffill along axis 1 of the transpose."""

    # GH9221
    # pass thru keyword arguments to the generated wrapper
    # are set if the passed kw is None (only)
    df = DataFrame(
        index=pd.MultiIndex.from_product(
            [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
        ),
        columns=Index(["1", "2"], name="id"),
    )
    df["1"] = [
        np.nan,
        1,
        np.nan,
        np.nan,
        11,
        np.nan,
        np.nan,
        2,
        np.nan,
        np.nan,
        22,
        np.nan,
    ]
    df["2"] = [
        np.nan,
        3,
        np.nan,
        np.nan,
        33,
        np.nan,
        np.nan,
        4,
        np.nan,
        np.nan,
        44,
        np.nan,
    ]

    expected = df.groupby(level=0, axis=0).fillna(method="ffill")
    # same operation on the transposed frame, transposed back
    result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    """Check ffill/bfill when one group key may be NaN, for both ``dropna`` modes."""
    # GH 34725

    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])

    # reindexing with -1 (a label that does not exist) yields all-NaN rows
    ridx = [-1, 0, -1, -1, 1, -1]
    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

    group_b = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

    # (method, dropna, has_nan_group) -> source row of each expected row
    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }

    # subscript rather than .get(): a missing case must fail loudly instead of
    # silently reindexing with None
    ridx = expected_rows[(method, dropna, has_nan_group)]
    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    """Check ``min_count`` handling for first/last/max/min."""
    # GH#37821
    # column "b" has a single non-NaN value, column "c" has none
    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
    result = getattr(df.groupby("a"), func)(min_count=min_count)
    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_indices_with_missing():
    """Check that ``indices`` leaves out the row whose group key is NaN."""
    # GH 9304
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    g = df.groupby(["a", "b"])
    result = g.indices
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected
|
||||
@@ -0,0 +1,843 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_first_last_nth(df):
    """Check first / last / nth on the shared ``df`` fixture grouped by "A"."""
    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    nth = grouped.nth(0)
    tm.assert_frame_equal(nth, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(-1)
    tm.assert_frame_equal(nth, expected)

    nth = grouped.nth(1)
    expected = df.loc[[2, 3], ["B", "C", "D"]].copy()
    expected.index = Index(["foo", "bar"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(nth, expected)

    # it works!
    grouped["B"].first()
    grouped["B"].last()
    grouped["B"].nth(0)

    df.loc[df["A"] == "foo", "B"] = np.nan
    assert isna(grouped["B"].first()["foo"])
    assert isna(grouped["B"].last()["foo"])
    assert isna(grouped["B"].nth(0)["foo"])

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.first()
    expected = df.iloc[[1, 2]].set_index("A")
    tm.assert_frame_equal(result, expected)

    expected = df.iloc[[1, 2]].set_index("A")
    result = g.nth(0, dropna="any")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    """Check that first/last skip a trailing null value in column "b"."""
    # https://github.com/pandas-dev/pandas/issues/32123
    groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
    result = getattr(groups, method)()

    if method == "first":
        values = [1, 3]
    else:
        values = [2, 3]

    # take the dtype from the result so that only the values are compared
    values = np.array(values, dtype=result["b"].dtype)
    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": values}, index=idx)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    """Check that nth, unlike first/last, returns a null value when it is selected."""
    # https://github.com/pandas-dev/pandas/issues/32123
    groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
    result = groups.nth(index)

    if index == 0:
        values = [1, 3]
    else:
        values = [2, nulls_fixture]

    # take the dtype from the result so that only the values are compared
    values = np.array(values, dtype=result["b"].dtype)
    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": values}, index=idx)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    """Check first/last on a one-row frame whose value is ``None``."""
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
    groups = df.groupby("id", as_index=False)
    result = getattr(groups, method)()

    tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    """Check first/last on object columns that mix None, NaN and strings."""
    # GH 32800, 38286
    result = getattr(df.groupby("id"), method)()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes(df_mixed_floats):
    """Check first / last / nth on a frame with extra bool and int columns."""

    df = df_mixed_floats.copy()
    df["E"] = True
    df["F"] = 1

    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    tm.assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = list(range(10))
    idx.append(9)
    s = Series(data=range(11), index=idx, name="IntCol")
    assert s.dtype == "int64"
    f = s.groupby(level=0).first()
    assert f.dtype == "int64"
|
||||
|
||||
|
||||
def test_first_last_nth_nan_dtype():
    """Check first / last / nth on an object column that holds only NaN."""
    # GH 33591
    df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)})

    grouped = df.groupby("data")
    expected = df.set_index("data").nans
    tm.assert_series_equal(grouped.nans.first(), expected)
    tm.assert_series_equal(grouped.nans.last(), expected)
    tm.assert_series_equal(grouped.nans.nth(-1), expected)
    tm.assert_series_equal(grouped.nans.nth(0), expected)
|
||||
|
||||
|
||||
def test_first_strings_timestamps():
    """Check ``first`` on a frame whose column labels mix Timestamps and strings."""
    # GH 11244
    test = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    result = test.groupby("name").first()
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth():
    """Check ``nth`` with scalar, negative, out-of-range and list arguments."""
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")

    tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A"))
    tm.assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A"))
    tm.assert_frame_equal(g.nth(2), df.loc[[]].set_index("A"))
    tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A"))
    tm.assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A"))
    tm.assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A"))
    tm.assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]])
    tm.assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]])
    tm.assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A"))

    exp = df.set_index("A")
    tm.assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]])
    tm.assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]])

    exp["B"] = np.nan
    tm.assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]])
    tm.assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame(
        {
            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
            "two": {
                0: 1.5456590000000001,
                1: -0.070345000000000005,
                2: -2.4004539999999999,
                3: 0.46206000000000003,
                4: 0.52350799999999997,
            },
            "one": {
                0: 0.56573799999999996,
                1: -0.9742360000000001,
                2: 1.033801,
                3: -0.78543499999999999,
                4: 0.70422799999999997,
            },
        }
    ).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    # seeded: the checks below index s[g == 1] and so depend on the drawn data
    prng = np.random.RandomState(1234)
    df = DataFrame(prng.randint(1, 10, (100, 2)), dtype="int64")
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = s[g == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    result = s.groupby(g, sort=False).nth(0, dropna="all")
    tm.assert_series_equal(result, expected)

    with pytest.raises(ValueError, match="For a DataFrame"):
        s.groupby(g, sort=False).nth(0, dropna=True)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.B.nth(0, dropna="all")
    expected = g.B.first()
    tm.assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    g = df.groupby("A")

    tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A"))
    tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A"))
    tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A"))

    business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")
    df = DataFrame(1, index=business_dates, columns=["a", "b"])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    )
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_multi_index(three_group):
    """Check that ``nth(0)`` equals ``first()`` when grouping by two columns."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex, should match .first()
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = grouped.first()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """Check first/last with ``as_index=False`` on timezone-aware timestamps."""
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False

    df = DataFrame(data)

    result = df.groupby("id", as_index=False).first()
    expected = DataFrame(expected_first)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].first()
    tm.assert_frame_equal(result, expected[["id", "time"]])

    result = df.groupby("id", as_index=False).last()
    expected = DataFrame(expected_last)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].last()
    tm.assert_frame_equal(result, expected[["id", "time"]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha):
    """Check first/last on a frame with categorical and tz-aware columns."""
    # GH 21603
    category_string = Series(list("abc")).astype("category")
    df = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
        }
    )
    result = getattr(df.groupby("group"), method)()
    # group 1 varies with the method (ts / alpha); group 2 has a single row
    expected = DataFrame(
        {
            "category_string": pd.Categorical(
                [alpha, "c"], dtype=category_string.dtype
            ),
            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
        },
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    """Reduce single-row groups and compare against the untouched input values.

    https://github.com/pandas-dev/pandas/issues/33071
    https://github.com/pandas-dev/pandas/issues/32194
    """
    group_index = Index([1, 2], name="a")
    gb = DataFrame({"a": [1, 2], "b": values}).groupby("a")

    # Series path: call the reduction directly on the selected column.
    series_result = getattr(gb["b"], function)()
    tm.assert_series_equal(
        series_result, Series(values, name="b", index=group_index)
    )

    # Frame path: route the same reduction through ``agg``.
    frame_result = gb.agg({"b": function})
    tm.assert_frame_equal(frame_result, DataFrame({"b": values}, index=group_index))
|
||||
|
||||
|
||||
def test_nth_multi_index_as_expected():
    """Take ``nth(0)`` of a two-key groupby and compare with a MultiIndex frame.

    PR 9090, related to issue 8979.
    """
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]  # fmt: skip
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]  # fmt: skip
    three_group = DataFrame({"A": col_a, "B": col_b, "C": col_c})

    result = three_group.groupby(["A", "B"]).nth(0)

    expected_index = MultiIndex.from_arrays(
        [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]],
        names=["A", "B"],
    )
    expected = DataFrame({"C": ["dull"] * 4}, index=expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    """Compare ``head``/``tail`` on a groupby with an ``iloc`` row selection."""
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    grouped = df.groupby("A", as_index=as_index)
    expected = df.iloc[expected_rows]

    # Narrow both sides to the requested columns when a selection is given.
    if columns is not None:
        grouped, expected = grouped[columns], expected[columns]

    result = getattr(grouped, op)(n)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, n, expected_cols",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
    """Compare ``head``/``tail`` of a column-wise groupby with ``iloc`` columns.

    GH 9772
    """
    rows = [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]]
    df = DataFrame(rows, columns=["A", "B", "C"])

    column_groups = df.groupby([0, 0, 1], axis=1)
    result = getattr(column_groups, op)(n)

    tm.assert_frame_equal(result, df.iloc[:, expected_cols])
|
||||
|
||||
|
||||
def test_group_selection_cache():
    """Call ``nth`` before and after ``head``/``tail`` on one groupby object.

    GH 12839: nth, head, and tail should return same result consistently.
    """
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]].set_index("A")

    # head/tail first, then nth on the same groupby object.
    for whole_frame_op in ("head", "tail"):
        g = df.groupby("A")
        whole = getattr(g, whole_frame_op)(n=2)
        picked = g.nth(0)
        tm.assert_frame_equal(whole, df)
        tm.assert_frame_equal(picked, expected)

    # nth first, then head/tail on the same groupby object.
    for whole_frame_op in ("head", "tail"):
        g = df.groupby("A")
        picked = g.nth(0)
        whole = getattr(g, whole_frame_op)(n=2)
        tm.assert_frame_equal(picked, expected)
        tm.assert_frame_equal(whole, df)
|
||||
|
||||
|
||||
def test_nth_empty():
    """Request an out-of-range ``nth`` position for one and two grouping keys.

    GH 16064
    """
    df = DataFrame(index=[0], columns=["a", "b", "c"])

    # Single grouping key.
    single_key = df.groupby("a").nth(10)
    tm.assert_frame_equal(
        single_key, DataFrame(index=Index([], name="a"), columns=["b", "c"])
    )

    # Two grouping keys.
    double_key = df.groupby(["a", "b"]).nth(10)
    empty_multi = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"])
    tm.assert_frame_equal(double_key, DataFrame(index=empty_multi, columns=["c"]))
|
||||
|
||||
|
||||
def test_nth_column_order():
    """Check that ``nth`` keeps columns in the frame's original order.

    GH 20760
    """
    rows = [
        [1, "b", 100],
        [1, "a", 50],
        [1, "a", np.nan],
        [2, "c", 200],
        [2, "d", 150],
    ]
    df = DataFrame(rows, columns=["A", "C", "B"])

    # (positional args, keyword args, expected rows) for each nth call.
    cases = [
        ((0,), {}, [["b", 100.0], ["c", 200.0]]),
        ((-1,), {"dropna": "any"}, [["a", 50.0], ["d", 150.0]]),
    ]
    for args, kwargs, expected_rows in cases:
        result = df.groupby("A").nth(*args, **kwargs)
        expected = DataFrame(
            expected_rows, columns=["C", "B"], index=Index([1, 2], name="A")
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """Run ``nth(0)`` when the grouping column contains NaN keys.

    GH 26011
    """
    rows = [
        [np.nan, 0, 1],
        ["abc", 2, 3],
        [np.nan, 4, 5],
        ["def", 6, 7],
        [np.nan, 8, 9],
    ]
    df = DataFrame(rows, columns=list("abc"))

    result = df.groupby("a").nth(0, dropna=dropna)

    # Only the two non-NaN keys are expected in the result.
    expected = DataFrame(
        [[2, 3], [6, 7]],
        columns=list("bc"),
        index=Index(["abc", "def"], name="a"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_categorical_and_datetime_data_nat():
    """Take ``first`` per group when a datetime column is all NaT.

    GH 20520
    """
    nat = np.datetime64("NaT")
    df = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": [nat] * 5,
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )

    result = df.groupby("group").first()

    # Build the categorical as an array so it is not aligned on the new index.
    expected_categories = pd.Categorical(["a", "c", "a"], categories=["a", "b", "c"])
    expected = DataFrame(
        {"time": [nat] * 3, "categories": expected_categories},
        index=Index(["first", "second", "third"], name="group"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_multi_key_groupbby_categorical():
    """Take ``first`` over two integer keys with a categorical value column.

    GH 22512
    """
    raw = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = raw.astype({"D": "category"})

    result = df.groupby(by=["A", "B"]).first()

    # Build the categorical as an array so it is not aligned on the new index.
    expected_d = pd.Categorical(
        ["jupiter", "mars", "venus"],
        categories=["jupiter", "mars", "mercury", "venus"],
    )
    expected_index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    expected = DataFrame(
        {"C": ["apple", "mango", "mango"], "D": expected_d},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    """Reduce a single group holding one ``"y"`` among null values.

    GH29645
    """
    values = [nulls_fixture] * 3 + ["y", nulls_fixture]
    grouped = Series(values, index=[0] * 5).groupby(level=0)

    # ``nth`` needs the position of "y"; ``first``/``last`` take no argument.
    call_args = (3,) if method == "nth" else ()
    result = getattr(grouped, method)(*call_args)

    tm.assert_series_equal(result, Series(["y"]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Pass slices to ``nth`` via both index and call notation (GH #42947)."""
    via_index = slice_test_grouped.nth[arg]
    via_call = slice_test_grouped.nth(arg)

    expected = slice_test_df.iloc[expected_rows]
    for outcome in (via_index, via_call):
        tm.assert_frame_equal(outcome, expected)
|
||||
|
||||
|
||||
def test_nth_indexed(slice_test_df, slice_test_grouped):
    """Compare ``nth`` index notation with the equivalent list call (GH #44688)."""
    via_index = slice_test_grouped.nth[0, 1, -2:]
    via_call = slice_test_grouped.nth([0, 1, slice(-2, None)])

    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
    for outcome in (via_index, via_call):
        tm.assert_frame_equal(outcome, expected)
|
||||
|
||||
|
||||
def test_invalid_argument(slice_test_grouped):
    """Expect ``TypeError`` when ``nth`` is given a float."""
    bad_position = 3.14
    with pytest.raises(TypeError, match="Invalid index"):
        slice_test_grouped.nth(bad_position)
|
||||
|
||||
|
||||
def test_negative_step(slice_test_grouped):
    """Expect ``ValueError`` when ``nth`` is given a slice with a negative step."""
    reversed_slice = slice(None, None, -1)
    with pytest.raises(ValueError, match="Invalid step"):
        slice_test_grouped.nth(reversed_slice)
|
||||
|
||||
|
||||
def test_np_ints(slice_test_df, slice_test_grouped):
    """Pass a NumPy integer array of positions to ``nth``."""
    positions = np.array([0, 1])
    result = slice_test_grouped.nth(positions)
    tm.assert_frame_equal(result, slice_test_df.iloc[[0, 1, 2, 3, 4]])
|
||||
|
||||
|
||||
def test_groupby_nth_with_column_axis():
    """Group columns by the values of row ``"y"`` and take ``nth(0)``.

    GH43926
    """
    row_labels = ["z", "y"]
    df = DataFrame([[4, 5, 6], [8, 8, 7]], index=row_labels, columns=["C", "B", "A"])

    # The second row ("y") supplies the grouping key for each column.
    grouping_row = df.iloc[1]
    result = df.groupby(grouping_row, axis=1).nth(0)

    expected = DataFrame(
        [[6, 4], [7, 8]],
        index=row_labels,
        columns=Index([7, 8], name="y"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start, stop, expected_values, expected_columns",
    [
        (None, None, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]),
        (None, 1, [0, 3], [5, 6]),
        (None, 9, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]),
        (None, -1, [0, 1, 3], [5, 5, 6]),
        (1, None, [1, 2, 4], [5, 5, 6]),
        (1, -1, [1], [5]),
        (-1, None, [2, 4], [5, 6]),
        (-1, 2, [4], [6]),
    ],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
    start, stop, expected_values, expected_columns, method
):
    """Slice ``nth`` on a column-wise groupby using call or index notation."""
    df = DataFrame([range(5)], columns=[list("ABCDE")])
    gb = df.groupby([5, 5, 5, 6, 6], axis=1)

    if method == "call":
        result = gb.nth(slice(start, stop))
    elif method == "index":
        result = gb.nth[start:stop]
    else:
        # Mirrors the lookup failure an unknown method name would cause.
        raise KeyError(method)

    expected = DataFrame([expected_values], columns=expected_columns)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_head_tail_dropna_true():
    """Run ``head``/``tail``/``nth`` with NaN group keys and the default ``dropna``.

    GH#45089
    """
    rows = [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]]
    df = DataFrame(rows, columns=["X", "Y"])
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    selectors = (
        lambda gb: gb.head(n=1),
        lambda gb: gb.tail(n=1),
        lambda gb: gb.nth(n=0).reset_index(),
    )
    # A fresh groupby is built for every selector.
    for select in selectors:
        result = select(df.groupby(["X", "Y"]))
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_head_tail_dropna_false():
    """Run ``head``/``tail``/``nth`` with NaN group keys and ``dropna=False``.

    GH#45089
    """
    rows = [["a", "z"], ["b", np.nan], ["c", np.nan]]
    df = DataFrame(rows, columns=["X", "Y"])
    expected = DataFrame(rows, columns=["X", "Y"])

    selectors = (
        lambda gb: gb.head(n=1),
        lambda gb: gb.tail(n=1),
        lambda gb: gb.nth(n=0).reset_index(),
    )
    # A fresh groupby is built for every selector.
    for select in selectors:
        result = select(df.groupby(["X", "Y"], dropna=False))
        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,73 @@
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
    """Compare groupby reductions run with ``engine="numba"`` to the default engine.

    Also checks that the numba engine raises ``NotImplementedError`` for
    ``as_index=False`` and ``axis=1`` groupbys.
    """

    @staticmethod
    def _make_frame():
        """Return the small three-column frame used by the frame-based tests."""
        return DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})

    @staticmethod
    def _assert_engines_agree(gb, reduction, nogil, parallel, nopython, assert_equal):
        """Run ``reduction`` on ``gb`` with both engines and compare the results.

        ``reduction`` is a ``(method name, keyword arguments)`` pair and
        ``assert_equal`` is the ``tm.assert_*_equal`` helper matching the
        result type.
        """
        func, kwargs = reduction
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        # check_dtype can be removed if GH 44952 is addressed
        check_dtype = func != "sum"
        assert_equal(result, expected, check_dtype=check_dtype)

    def test_cython_vs_numba_frame(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        gb = self._make_frame().groupby("a", sort=sort)
        self._assert_engines_agree(
            gb,
            numba_supported_reductions,
            nogil,
            parallel,
            nopython,
            tm.assert_frame_equal,
        )

    def test_cython_vs_numba_getitem(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        gb = self._make_frame().groupby("a", sort=sort)["c"]
        self._assert_engines_agree(
            gb,
            numba_supported_reductions,
            nogil,
            parallel,
            nopython,
            tm.assert_series_equal,
        )

    def test_cython_vs_numba_series(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        ser = Series(range(3), index=[1, 2, 1], name="foo")
        gb = ser.groupby(level=0, sort=sort)
        self._assert_engines_agree(
            gb,
            numba_supported_reductions,
            nogil,
            parallel,
            nopython,
            tm.assert_series_equal,
        )

    def test_as_index_false_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        gb = self._make_frame().groupby("a", as_index=False)
        with pytest.raises(NotImplementedError, match="as_index=False"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_axis_1_unsupported(self, numba_supported_reductions):
        func, kwargs = numba_supported_reductions
        gb = self._make_frame().groupby("a", axis=1)
        with pytest.raises(NotImplementedError, match="axis=1"):
            getattr(gb, func)(engine="numba", **kwargs)
|
||||
@@ -0,0 +1,184 @@
|
||||
import datetime as dt
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
NaT,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
@pytest.mark.parametrize("m", [10, 100, 1000])
@pytest.mark.parametrize("sort", [False, True])
@pytest.mark.parametrize("dropna", [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
    """Compare ``SeriesGroupBy.nunique`` with applying ``Series.nunique`` per group.

    ``n`` is the number of rows and ``m`` the exclusive upper bound of the
    random integer values in the counted column. The data comes from a
    seeded generator so that a failing parametrization can be reproduced.
    """

    def check_nunique(df, keys, as_index=True):
        # Keep a copy to verify that neither path mutates the input frame.
        original_df = df.copy()

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        left = gr["julie"].nunique(dropna=dropna)

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        right = gr["julie"].apply(Series.nunique, dropna=dropna)
        if not as_index:
            right = right.reset_index(drop=True)

        if as_index:
            tm.assert_series_equal(left, right, check_names=False)
        else:
            tm.assert_frame_equal(left, right, check_names=False)
        tm.assert_frame_equal(df, original_df)

    # Seeded local generator instead of the unseeded global np.random state.
    rng = np.random.RandomState(0)
    days = date_range("2015-08-23", periods=10)

    frame = DataFrame(
        {
            "jim": rng.choice(list(ascii_lowercase), n),
            "joe": rng.choice(days, n),
            "julie": rng.randint(0, m, n),
        }
    )

    check_nunique(frame, ["jim"])
    check_nunique(frame, ["jim", "joe"])

    # Punch missing values into both key columns and the counted column.
    frame.loc[1::17, "jim"] = None
    frame.loc[3::37, "joe"] = None
    frame.loc[7::19, "julie"] = None
    frame.loc[8::19, "julie"] = None
    frame.loc[9::19, "julie"] = None

    check_nunique(frame, ["jim"])
    check_nunique(frame, ["jim", "joe"])
    check_nunique(frame, ["jim"], as_index=False)
    check_nunique(frame, ["jim", "joe"], as_index=False)
|
||||
|
||||
|
||||
def test_nunique():
    """Exercise ``DataFrameGroupBy.nunique`` with and without ``as_index``/``dropna``."""
    df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
    group_index = pd.Index(list("abc"), name="A")

    # as_index=False keeps the key as a regular column
    flat_expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
    result = df.groupby("A", as_index=False).nunique()
    tm.assert_frame_equal(result, flat_expected)

    # as_index
    indexed_expected = DataFrame({"B": [1, 2, 1], "C": [1, 1, 2]}, index=group_index)
    result = df.groupby("A").nunique()
    tm.assert_frame_equal(result, indexed_expected)

    # with na
    with_na = df.replace({"x": None})
    result = with_na.groupby("A").nunique(dropna=False)
    tm.assert_frame_equal(result, indexed_expected)

    # dropna
    dropped_expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
    dropped_expected.index.name = "A"
    result = df.replace({"x": None}).groupby("A").nunique()
    tm.assert_frame_equal(result, dropped_expected)
|
||||
|
||||
|
||||
def test_nunique_with_object():
    """Count unique names per ``(id, amount)`` pair.

    GH 11077
    """
    records = [
        [100, 1, "Alice"],
        [200, 2, "Bob"],
        [300, 3, "Charlie"],
        [-400, 4, "Dan"],
        [500, 5, "Edith"],
    ]
    data = DataFrame(records, columns=["amount", "id", "name"])

    result = data.groupby(["id", "amount"])["name"].nunique()

    # Every (id, amount) pair occurs once, so each group has one unique name.
    expected_index = MultiIndex.from_arrays([data.id, data.amount])
    expected = Series([1] * 5, name="name", index=expected_index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_nunique_with_empty_series():
    """Call ``nunique`` on a groupby of an empty object Series.

    GH 12553
    """
    empty = Series(name="name", dtype=object)
    counted = empty.groupby(level=0).nunique()
    tm.assert_series_equal(counted, Series(name="name", dtype="int64"))
|
||||
|
||||
|
||||
def test_nunique_with_timegrouper():
    """Compare ``nunique`` with ``apply(Series.nunique)`` under an hourly Grouper.

    GH 13453
    """
    stamps = [
        Timestamp("2016-06-28 09:35:35"),
        Timestamp("2016-06-28 16:09:30"),
        Timestamp("2016-06-28 16:46:28"),
    ]
    test = DataFrame({"time": stamps, "data": ["1", "2", "3"]}).set_index("time")

    # Each side uses its own Grouper and groupby object.
    direct = test.groupby(pd.Grouper(freq="h"))["data"]
    result = direct.nunique()
    applied = test.groupby(pd.Grouper(freq="h"))["data"]
    expected = applied.apply(Series.nunique)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, data, dropna, expected",
    [
        (
            ["x", "x", "x"],
            [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")],
            True,
            Series([1], index=pd.Index(["x"], name="key"), name="data"),
        ),
        (
            ["x", "x", "x"],
            [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
            True,
            Series([1], index=pd.Index(["x"], name="key"), name="data"),
        ),
        (
            ["x", "x", "x", "y", "y"],
            [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
            False,
            Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
        ),
        (
            ["x", "x", "x", "x", "y"],
            [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
            False,
            Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
        ),
    ],
)
def test_nunique_with_NaT(key, data, dropna, expected):
    """Count unique values per key when the data column contains ``NaT``.

    GH 27951
    """
    frame = DataFrame({"key": key, "data": data})
    grouped_data = frame.groupby(["key"])["data"]
    result = grouped_data.nunique(dropna=dropna)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_nunique_preserves_column_level_names():
    """Check that the columns' name survives ``groupby(...).nunique()``.

    GH 23222
    """
    named_columns = pd.Index(["A"], name="level_0")
    test = DataFrame([1, 2, 2], columns=named_columns)

    result = test.groupby([0, 0, 0]).nunique()

    tm.assert_frame_equal(result, DataFrame([2], columns=test.columns))
|
||||
|
||||
|
||||
def test_nunique_transform_with_datetime():
    """Transform a datetime column with ``"nunique"`` and expect integer counts.

    GH 35109 - transform with nunique on datetimes results in integers
    """
    dates = date_range("2008-12-31", "2009-01-02")
    df = DataFrame(dates, columns=["date"])

    grouped_dates = df.groupby([0, 0, 1])["date"]
    result = grouped_dates.transform("nunique")

    tm.assert_series_equal(result, Series([2, 2, 1], name="date"))
|
||||
@@ -0,0 +1,82 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.api import Int64Index
|
||||
|
||||
|
||||
def test_pipe():
    """Chain ``DataFrameGroupBy.pipe`` into the pipe of its Series result.

    Issue #17871
    """
    rng = np.random.RandomState(1234567890)
    labels = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    # "B" is drawn before "C", so the generator is consumed in this order.
    df = DataFrame({"A": labels, "B": rng.randn(8), "C": rng.randn(8)})

    def spread(dfgb):
        # GroupBy -> Series
        return dfgb.B.max() - dfgb.C.min().min()

    def squared(srs):
        # Series -> Series
        return srs**2

    # GroupBy.pipe feeds into NDFrame.pipe.
    result = df.groupby("A").pipe(spread).pipe(squared)

    expected = pd.Series(
        [8.99110003361, 8.17516964785],
        name="B",
        index=Index(["bar", "foo"], dtype="object", name="A"),
    )
    tm.assert_series_equal(expected, result)
|
||||
|
||||
|
||||
def test_pipe_args():
    """Pass extra positional arguments through chained ``pipe`` calls.

    Issue #17871
    """
    df = DataFrame(
        {
            "group": ["A", "A", "B", "B", "C"],
            "x": [1.0, 2.0, 3.0, 2.0, 5.0],
            "y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
        }
    )

    def keep_groups_above(dfgb, arg1):
        # Filter with dropna=False, then regroup with the original grouper.
        filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
        return filtered.groupby(dfgb.grouper)

    def share_of_total(dfgb, arg2):
        group_sums = dfgb.sum()
        return group_sums / group_sums.sum() + arg2

    def combine(df, arg3):
        return df.x + df.y - arg3

    result = (
        df.groupby("group")
        .pipe(keep_groups_above, 0)
        .pipe(share_of_total, 10)
        .pipe(combine, 100)
    )

    # Assert the results here
    expected = pd.Series(
        [-79.5160891089, -78.4839108911, -80],
        index=Index(["A", "B", "C"], name="group"),
    )
    tm.assert_series_equal(expected, result)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])
    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
    expected = pd.Series([4, 8, 12], index=Int64Index([1, 2, 3]))
    tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,331 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
    "a_vals,b_vals",
    [
        # Ints
        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ([1, 2, 3, 4], [4, 3, 2, 1]),
        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
        # Floats
        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
        # Missing data
        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
        # Timestamps
        (
            list(pd.date_range("1/1/18", freq="D", periods=5)),
            list(pd.date_range("1/1/18", freq="D", periods=5))[::-1],
        ),
        # All NA
        ([np.nan] * 5, [np.nan] * 5),
    ],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q):
    """Compare per-group ``quantile`` with ``Series.quantile`` on each group's values."""
    ambiguous_nearest = (
        interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]
    )
    if ambiguous_nearest:
        pytest.skip(
            "Unclear numpy expectation for nearest result with equidistant data"
        )

    # Reference quantiles computed directly on each group's values.
    per_group = [
        pd.Series(vals).quantile(q, interpolation=interpolation)
        for vals in (a_vals, b_vals)
    ]

    keys = ["a"] * len(a_vals) + ["b"] * len(b_vals)
    df = DataFrame({"key": keys, "val": a_vals + b_vals})

    expected = DataFrame(
        per_group, columns=["val"], index=Index(["a", "b"], name="key")
    )
    result = df.groupby("key").quantile(q, interpolation=interpolation)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array():
    """Pass a list of quantiles and expect a MultiIndex of ``(group, q)`` rows.

    https://github.com/pandas-dev/pandas/issues/27526
    """
    # One column, one requested quantile.
    single = DataFrame({"A": [0, 1, 2, 3, 4]})
    result = single.groupby([0, 0, 1, 1, 1]).quantile([0.25])
    expected = DataFrame(
        {"A": [0.25, 2.50]},
        index=pd.MultiIndex.from_product([[0, 1], [0.25]]),
    )
    tm.assert_frame_equal(result, expected)

    # Two columns, two requested quantiles.
    double = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    result = double.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]},
        index=pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array2():
    """Take two quantiles per group of a seeded random integer frame.

    https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
    """
    values = np.random.RandomState(0).randint(0, 5, size=(10, 3))
    df = DataFrame(values, columns=list("ABC"))

    result = df.groupby("A").quantile([0.3, 0.7])

    expected_index = pd.MultiIndex.from_product(
        [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
    )
    expected = DataFrame(
        {
            "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0],
            "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0],
        },
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_no_sort():
    """Check group order and quantile order are both kept with ``sort=False``."""
    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    group_keys = [1, 0, 1]

    # (requested quantiles, expected "A" values, expected "B" values)
    cases = [
        (
            [0.25, 0.5, 0.75],
            [0.5, 1.0, 1.5, 1.0, 1.0, 1.0],
            [3.5, 4.0, 4.5, 4.0, 4.0, 4.0],
        ),
        (
            [0.75, 0.25],
            [1.5, 0.5, 1.0, 1.0],
            [4.5, 3.5, 4.0, 4.0],
        ),
    ]
    for quantiles, a_expected, b_expected in cases:
        result = df.groupby(group_keys, sort=False).quantile(quantiles)
        expected = DataFrame(
            {"A": a_expected, "B": b_expected},
            index=pd.MultiIndex.from_product([[1, 0], quantiles]),
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_multiple_levels():
    """Take two quantiles from a groupby on two string key columns."""
    df = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )

    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])

    # The requested quantile becomes an unnamed third index level.
    index_rows = [
        ("a", "a", 0.25),
        ("a", "a", 0.75),
        ("a", "b", 0.25),
        ("a", "b", 0.75),
    ]
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]},
        index=pd.MultiIndex.from_tuples(index_rows, names=["c", "d", None]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    """Group by integer column labels and request a list of quantiles.

    GH30289
    """
    nrow, ncol = frame_size
    n_groups = min(nrow, 4)
    n_keys = len(groupby)

    # Every row repeats ``row % 4`` across all columns.
    values = np.array([[row % 4] * ncol for row in range(nrow)])
    df = DataFrame(values, columns=range(ncol))

    # One level per grouping key plus a final level holding the quantiles.
    group_codes = [group for group in range(n_groups) for _ in q]
    idx_levels = [list(range(n_groups))] * n_keys + [q]
    idx_codes = [group_codes] * n_keys + [list(range(len(q))) * n_groups]
    expected_index = pd.MultiIndex(
        levels=idx_levels, codes=idx_codes, names=groupby + [None]
    )

    n_value_columns = ncol - n_keys
    expected_values = [
        [float(group)] * n_value_columns for group in range(n_groups) for _ in q
    ]
    expected_columns = [col for col in range(ncol) if col not in groupby]
    expected = DataFrame(
        expected_values, index=expected_index, columns=expected_columns
    )

    result = df.groupby(groupby).quantile(q)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_raises():
    """Expect ``TypeError`` and a ``FutureWarning`` for quantile on an object column."""
    rows = [["foo", "a"], ["foo", "b"], ["foo", "c"]]
    df = DataFrame(rows, columns=["key", "val"])

    # Outer manager checks the exception, inner manager checks the warning.
    with pytest.raises(
        TypeError, match="cannot be performed against 'object' dtypes"
    ), tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
        df.groupby("key").quantile()
|
||||
|
||||
|
||||
def test_quantile_out_of_bounds_q_raises():
    """Expect ``ValueError`` when ``q`` is 50 or -1.

    https://github.com/pandas-dev/pandas/issues/27470
    """
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    g = df.groupby([0, 0, 0, 1, 1, 1])

    bad_quantiles = (
        (50, "Got '50.0' instead"),
        (-1, "Got '-1.0' instead"),
    )
    for q, message in bad_quantiles:
        with pytest.raises(ValueError, match=message):
            g.quantile(q)
|
||||
|
||||
|
||||
def test_quantile_missing_group_values_no_segfaults():
    """Repeatedly call quantile on a grouping whose key column contains NaN."""
    # GH 28662
    keys = np.array([1.0, np.nan, 1.0])
    grouped = DataFrame({"key": keys, "val": range(3)}).groupby("key")

    # Random segfaults; would have been guaranteed in loop
    for _attempt in range(100):
        grouped.quantile()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    """Compare ``quantile(0.5)`` and ``quantile()`` against the expected frame."""
    # GH 28662, GH 33200, GH 33569
    grouped = DataFrame({"key": key, "val": val}).groupby("key")
    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    # First with q passed explicitly, then relying on the default.
    for q_args in ((0.5,), ()):
        tm.assert_frame_equal(grouped.quantile(*q_args), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values",
|
||||
[
|
||||
pd.array([1, 0, None] * 2, dtype="Int64"),
|
||||
pd.array([True, False, None] * 2, dtype="boolean"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
|
||||
def test_groupby_quantile_nullable_array(values, q):
|
||||
# https://github.com/pandas-dev/pandas/issues/33136
|
||||
df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
|
||||
result = df.groupby("a")["b"].quantile(q)
|
||||
|
||||
if isinstance(q, list):
|
||||
idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
|
||||
true_quantiles = [0.0, 0.5, 1.0]
|
||||
else:
|
||||
idx = Index(["x", "y"], name="a")
|
||||
true_quantiles = [0.5]
|
||||
|
||||
expected = pd.Series(true_quantiles * 2, index=idx, name="b")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
|
||||
def test_groupby_quantile_skips_invalid_dtype(q):
|
||||
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
|
||||
|
||||
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
|
||||
result = df.groupby("a").quantile(q)
|
||||
|
||||
expected = df.groupby("a")[["b"]].quantile(q)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_float(any_float_dtype):
|
||||
# GH#42849
|
||||
df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
|
||||
result = df.groupby("x")["y"].quantile(0.5)
|
||||
exp_index = Index([1.0], dtype=any_float_dtype, name="x")
|
||||
expected = pd.Series([0.2], dtype=float, index=exp_index, name="y")
|
||||
tm.assert_series_equal(expected, result)
|
||||
|
||||
result = df.groupby("x")["y"].quantile([0.5, 0.75])
|
||||
expected = pd.Series(
|
||||
[0.2] * 2,
|
||||
index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
|
||||
name="y",
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_int(any_int_ea_dtype):
|
||||
# GH#42849
|
||||
df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
|
||||
result = df.groupby("x")["y"].quantile(0.5)
|
||||
expected = pd.Series(
|
||||
[3.5], dtype=float, index=Index([1], name="x", dtype=any_int_ea_dtype), name="y"
|
||||
)
|
||||
tm.assert_series_equal(expected, result)
|
||||
|
||||
result = df.groupby("x").quantile(0.5)
|
||||
expected = DataFrame({"y": 3.5}, index=Index([1], name="x", dtype=any_int_ea_dtype))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
|
||||
def test_groupby_quantile_allNA_column(dtype):
|
||||
# GH#42849
|
||||
df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
|
||||
result = df.groupby("x")["y"].quantile(0.5)
|
||||
expected = pd.Series(
|
||||
[np.nan], dtype=float, index=Index([1.0], dtype=dtype), name="y"
|
||||
)
|
||||
expected.index.name = "x"
|
||||
tm.assert_series_equal(expected, result)
|
||||
|
||||
|
||||
def test_groupby_timedelta_quantile():
    """Check the 0.99 quantile of a timedelta column per group."""
    # GH: 29485
    seconds = pd.to_timedelta(np.arange(4), unit="s")
    df = DataFrame({"value": seconds, "group": [1, 1, 2, 2]})

    result = df.groupby("group").quantile(0.99)

    expected_values = [
        pd.Timedelta("0 days 00:00:00.990000"),
        pd.Timedelta("0 days 00:00:02.990000"),
    ]
    expected = DataFrame(
        {"value": expected_values}, index=Index([1, 2], name="group")
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_columns_groupby_quantile():
    """Check list ``q`` when grouping along the columns (``axis=1``)."""
    # GH 33795
    col_labels = pd.Series(list("ABAB"), name="col")
    row_labels = list("XYZ")
    df = DataFrame(
        np.arange(12).reshape(3, -1), index=row_labels, columns=col_labels
    )

    result = df.groupby("col", axis=1).quantile(q=[0.8, 0.2])

    expected_columns = pd.MultiIndex.from_tuples(
        [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
    )
    expected_rows = [
        [1.6, 0.4, 2.6, 1.4],
        [5.6, 4.4, 6.6, 5.4],
        [9.6, 8.4, 10.6, 9.4],
    ]
    expected = DataFrame(expected_rows, index=row_labels, columns=expected_columns)

    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,663 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_rank_apply():
    """Compare grouped ``rank`` with ranking each group and concatenating."""
    # Keep the random draws in this order so the RNG call sequence is unchanged.
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)
    values = np.random.randn(500)

    df = DataFrame(
        {"value": values, "key1": lev1.take(lab1), "key2": lev2.take(lab2)}
    )
    keys = ["key1", "key2"]

    # Plain ranks first, then percentage ranks.
    for rank_kwargs in ({}, {"pct": True}):
        result = df.groupby(keys).value.rank(**rank_kwargs)

        pieces = [grp.value.rank(**rank_kwargs) for _, grp in df.groupby(keys)]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# Calendar days behind every datetime-like "vals" case of test_rank_args;
# their ordering mirrors the numeric case [2, 2, 8, 2, 6].
_RANK_ARGS_DAYS = ["2018-01-02", "2018-01-02", "2018-01-08", "2018-01-02", "2018-01-06"]


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        # tz-naive timestamps
        [pd.Timestamp(day) for day in _RANK_ARGS_DAYS],
        # tz-aware timestamps
        [pd.Timestamp(day, tz="US/Pacific") for day in _RANK_ARGS_DAYS],
        # timedeltas
        [pd.Timestamp(day) - pd.Timestamp(0) for day in _RANK_ARGS_DAYS],
        # periods
        [pd.Timestamp(day).to_period("D") for day in _RANK_ARGS_DAYS],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Check grouped ``rank`` for each ties method, direction and ``pct`` flag."""
    n_groups = len(grps)

    # Repeat the values once per group; ndarray inputs keep their dtype.
    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        tiled = np.array(tiled, dtype=vals.dtype)

    df = DataFrame({"key": np.repeat(grps, len(vals)), "val": tiled})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
# (ties_method, ascending, na_option, expected ranks) for test_infs_n_nans.
_INFS_N_NANS_CASES = [
    ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
    ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
    ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
    ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
    ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
    ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
    ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
    ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
    ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
    ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
    ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
    ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
    ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
    ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
    ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
    ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
    ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
    ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
    ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
    ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
    ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
    ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
    ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
    ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
    ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
    ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
    ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
    ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
    ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
    ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
]


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize("ties_method,ascending,na_option,exp", _INFS_N_NANS_CASES)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Check grouped ``rank`` on values mixing +/-inf and NaN."""
    # GH 20561
    n_groups = len(grps)
    df = DataFrame({"key": np.repeat(grps, len(vals)), "val": vals * n_groups})

    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
# Shorthands used only to keep the tables for test_rank_args_missing compact.
_NA = np.nan
_THIRD = 1.0 / 3.0

# Calendar days for the datetime-like "vals" cases; None marks a missing slot.
_RANK_MISSING_DAYS = [
    "2018-01-02",
    "2018-01-02",
    None,
    "2018-01-08",
    "2018-01-02",
    "2018-01-06",
    None,
    None,
]

# (ties_method, ascending, na_option, pct, expected ranks)
_RANK_MISSING_CASES = [
    # na_option="keep"
    ("average", True, "keep", False, [2.0, 2.0, _NA, 5.0, 2.0, 4.0, _NA, _NA]),
    ("average", True, "keep", True, [0.4, 0.4, _NA, 1.0, 0.4, 0.8, _NA, _NA]),
    ("average", False, "keep", False, [4.0, 4.0, _NA, 1.0, 4.0, 2.0, _NA, _NA]),
    ("average", False, "keep", True, [0.8, 0.8, _NA, 0.2, 0.8, 0.4, _NA, _NA]),
    ("min", True, "keep", False, [1.0, 1.0, _NA, 5.0, 1.0, 4.0, _NA, _NA]),
    ("min", True, "keep", True, [0.2, 0.2, _NA, 1.0, 0.2, 0.8, _NA, _NA]),
    ("min", False, "keep", False, [3.0, 3.0, _NA, 1.0, 3.0, 2.0, _NA, _NA]),
    ("min", False, "keep", True, [0.6, 0.6, _NA, 0.2, 0.6, 0.4, _NA, _NA]),
    ("max", True, "keep", False, [3.0, 3.0, _NA, 5.0, 3.0, 4.0, _NA, _NA]),
    ("max", True, "keep", True, [0.6, 0.6, _NA, 1.0, 0.6, 0.8, _NA, _NA]),
    ("max", False, "keep", False, [5.0, 5.0, _NA, 1.0, 5.0, 2.0, _NA, _NA]),
    ("max", False, "keep", True, [1.0, 1.0, _NA, 0.2, 1.0, 0.4, _NA, _NA]),
    ("first", True, "keep", False, [1.0, 2.0, _NA, 5.0, 3.0, 4.0, _NA, _NA]),
    ("first", True, "keep", True, [0.2, 0.4, _NA, 1.0, 0.6, 0.8, _NA, _NA]),
    ("first", False, "keep", False, [3.0, 4.0, _NA, 1.0, 5.0, 2.0, _NA, _NA]),
    ("first", False, "keep", True, [0.6, 0.8, _NA, 0.2, 1.0, 0.4, _NA, _NA]),
    ("dense", True, "keep", False, [1.0, 1.0, _NA, 3.0, 1.0, 2.0, _NA, _NA]),
    (
        "dense",
        True,
        "keep",
        True,
        [_THIRD, _THIRD, _NA, 3.0 / 3.0, _THIRD, 2.0 / 3.0, _NA, _NA],
    ),
    ("dense", False, "keep", False, [3.0, 3.0, _NA, 1.0, 3.0, 2.0, _NA, _NA]),
    (
        "dense",
        False,
        "keep",
        True,
        [3.0 / 3.0, 3.0 / 3.0, _NA, _THIRD, 3.0 / 3.0, 2.0 / 3.0, _NA, _NA],
    ),
    # na_option="bottom"
    ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
    (
        "average",
        True,
        "bottom",
        True,
        [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
    ),
    ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
    (
        "average",
        False,
        "bottom",
        True,
        [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
    ),
    ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
    (
        "min",
        True,
        "bottom",
        True,
        [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
    ),
    ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
    (
        "min",
        False,
        "bottom",
        True,
        [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
    ),
    ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
    ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
    ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
    (
        "max",
        False,
        "bottom",
        True,
        [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
    ),
    ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
    (
        "first",
        True,
        "bottom",
        True,
        [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
    ),
    ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
    (
        "first",
        False,
        "bottom",
        True,
        [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
    ),
    ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
    ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
    ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
    ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
]


@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        # tz-naive timestamps
        [np.nan if day is None else pd.Timestamp(day) for day in _RANK_MISSING_DAYS],
        # tz-aware timestamps
        [
            np.nan if day is None else pd.Timestamp(day, tz="US/Pacific")
            for day in _RANK_MISSING_DAYS
        ],
        # timedeltas
        [
            np.nan if day is None else pd.Timestamp(day) - pd.Timestamp(0)
            for day in _RANK_MISSING_DAYS
        ],
        # periods
        [
            np.nan if day is None else pd.Timestamp(day).to_period("D")
            for day in _RANK_MISSING_DAYS
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp", _RANK_MISSING_CASES
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Check grouped ``rank`` on values with missing entries (keep / bottom)."""
    n_groups = len(grps)

    # Repeat the values once per group; ndarray inputs keep their dtype.
    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        tiled = np.array(tiled, dtype=vals.dtype)

    df = DataFrame({"key": np.repeat(grps, len(vals)), "val": tiled})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pct,exp", [(False, [3.0] * 5), (True, [0.6] * 5)])
def test_rank_resets_each_group(pct, exp):
    """Check that two five-row groups of identical values rank the same way."""
    keys = ["a"] * 5 + ["b"] * 5
    df = DataFrame({"key": keys, "val": [1] * 10})

    result = df.groupby("key").rank(pct=pct)

    # the expectation repeats once per group
    exp_df = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
|
||||
)
|
||||
@pytest.mark.parametrize("upper", [True, False])
|
||||
def test_rank_avg_even_vals(dtype, upper):
|
||||
if upper:
|
||||
# use IntegerDtype/FloatingDtype
|
||||
dtype = dtype[0].upper() + dtype[1:]
|
||||
dtype = dtype.replace("Ui", "UI")
|
||||
df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
|
||||
df["val"] = df["val"].astype(dtype)
|
||||
assert df["val"].dtype == dtype
|
||||
|
||||
result = df.groupby("key").rank()
|
||||
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
|
||||
tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    """Compare ranks of string values with ranks of same-ordered numbers."""
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }
    group_keys = ["foo"] * 5

    df = DataFrame({"key": group_keys, "val": vals})
    has_missing = df["val"].isna().any()
    res = df.groupby("key").rank(**rank_kwargs)

    # construct our expected by using numeric values with the same ordering
    numeric_vals = [0, np.nan, 2, np.nan, 1] if has_missing else [0, 0, 2, 0, 1]
    df2 = DataFrame({"key": group_keys, "val": numeric_vals})
    alt = df2.groupby("key").rank(**rank_kwargs)

    tm.assert_frame_equal(res, alt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """Expect a ValueError when ``na_option`` is not an accepted value."""
    grouped = DataFrame({"key": ["foo"] * 5, "val": vals}).groupby("key")
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        grouped.rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )
|
||||
|
||||
|
||||
def test_rank_empty_group():
    """Check percentage ranks when one group's only value is NaN."""
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    pct_ranks = [0.5, np.nan, 1.0]

    # SeriesGroupBy path
    ser_result = df.groupby(column).B.rank(pct=True)
    tm.assert_series_equal(ser_result, Series(pct_ranks, name="B"))

    # DataFrameGroupBy path
    frame_result = df.groupby(column).rank(pct=True)
    tm.assert_frame_equal(frame_result, DataFrame({"B": pct_ranks}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Check dense percentage ranks, including groups with only NaN values."""
    # GH 23666
    grouped = DataFrame({"A": input_key, "B": input_value}).groupby("A")

    result = grouped.rank(method="dense", pct=True)

    tm.assert_frame_equal(result, DataFrame({"B": output_value}))
|
||||
|
||||
|
||||
def test_rank_min_int():
    """Check that the smallest int64 value is ranked as an ordinary value.

    The frame also carries a datetime-like column with ``NaT`` entries,
    whose expected ranks are NaN.
    """
    # GH-32859
    int64_info = np.iinfo(np.int64)
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [int64_info.min, int64_info.max, int64_info.min],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()

    # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
    # in NumPy 2.0, while ``np.nan`` works on every NumPy version.
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    """Check dense pct ranks when the same value appears in two groups."""
    # GH#40518
    fill_value = np.nan if use_nan else 3
    rows = [
        [-1, 1],
        [-1, 2],
        [1, fill_value],
        [-1, fill_value],
    ]
    df = DataFrame(rows, columns=["group", "val"])

    result = df.groupby(["group"])["val"].rank(method="dense", pct=True)

    expected_ranks = [0.5, 1, np.nan, np.nan] if use_nan else [1 / 3, 2 / 3, 1, 1]
    expected = Series(expected_ranks, name="val")

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_rank_multiindex():
    """Check ``rank(axis=1)`` when grouping on the first column level."""
    # GH27721
    left = DataFrame({"col1": [3, 4], "col2": [1, 2]})
    right = DataFrame({"col3": [5, 6], "col4": [7, 8]})
    df = concat({"a": left, "b": right}, axis=1)

    result = df.groupby(level=0, axis=1).rank(axis=1)

    # rank each top-level block on its own, then reassemble
    per_block = [df["a"].rank(axis=1), df["b"].rank(axis=1)]
    expected = concat(per_block, axis=1, keys=["a", "b"])

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_axis0_rank_axis1():
    """Check ``rank(axis=1)`` on a groupby whose own axis is 0."""
    # GH#41320
    data = {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}
    df = DataFrame(data, index=["a", "a", "b", "b"])
    gb = df.groupby(level=0, axis=0)

    res = gb.rank(axis=1)

    # This should match what we get when "manually" operating group-by-group
    per_group = [df.loc[label].rank(axis=1) for label in ("a", "b")]
    expected = concat(per_group, axis=0)
    tm.assert_frame_equal(res, expected)

    # check that we haven't accidentally written a case that coincidentally
    # matches rank(axis=0)
    alt = gb.rank(axis=0)
    assert not alt.equals(expected)
|
||||
|
||||
|
||||
def test_groupby_axis0_cummax_axis1():
    """Check ``cummax(axis=1)`` on a groupby whose own axis is 0."""
    # case where groupby axis is 0 and axis keyword in transform is 1

    # df has mixed dtype -> multiple blocks
    data = {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}
    df = DataFrame(data, index=["a", "a", "b", "b"])

    cmax = df.groupby(level=0, axis=0).cummax(axis=1)

    # Column 2 is expected to equal column 1 in the result.
    expected = df[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(cmax, expected)
|
||||
|
||||
|
||||
def test_non_unique_index():
|
||||
# GH 16577
|
||||
df = DataFrame(
|
||||
{"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
|
||||
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
|
||||
)
|
||||
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
|
||||
expected = Series(
|
||||
[1.0] * 4, index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, name="value"
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,144 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    """Expect two rows per group when sampling two equally sized groups."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 2 + [2] * 2

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(n=n, frac=frac)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(n=n, frac=frac)
    ser_expected = Series(sampled, name="b", index=ser_result.index)
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
def test_groupby_sample_unbalanced_groups_shape():
    """Expect five rows per group with ``n=5`` on groups of size 10 and 20."""
    population = [1] * 10 + [2] * 20
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 5 + [2] * 5

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(n=5)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(n=5)
    ser_expected = Series(sampled, name="b", index=ser_result.index)
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
def test_groupby_sample_index_value_spans_groups():
    """Check sampling when the same index label occurs in both groups."""
    population = [1] * 3 + [2] * 3
    df = DataFrame({"a": population, "b": population}, index=[1, 2, 2, 2, 2, 2])
    sampled = [1] * 2 + [2] * 2

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(n=2)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(n=2)
    ser_expected = Series(sampled, name="b", index=ser_result.index)
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
def test_groupby_sample_n_and_frac_raises():
    """Expect a ValueError when both ``n`` and ``frac`` are supplied."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    # DataFrameGroupBy first, then SeriesGroupBy
    for select in (lambda gb: gb, lambda gb: gb["b"]):
        with pytest.raises(ValueError, match=msg):
            select(df.groupby("a")).sample(n=1, frac=1.0)
|
||||
|
||||
|
||||
def test_groupby_sample_frac_gt_one_without_replacement_raises():
    """Expect a ValueError for ``frac`` above 1 with ``replace=False``."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    # DataFrameGroupBy first, then SeriesGroupBy
    for select in (lambda gb: gb, lambda gb: gb["b"]):
        with pytest.raises(ValueError, match=msg):
            select(df.groupby("a")).sample(frac=1.5, replace=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    """Expect a ValueError for a negative or non-integer ``n``."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})

    msg = (
        "A negative number of rows requested. Please provide `n` >= 0."
        if n < 0
        else "Only integers accepted as `n` values"
    )

    # DataFrameGroupBy first, then SeriesGroupBy
    for select in (lambda gb: gb, lambda gb: gb["b"]):
        with pytest.raises(ValueError, match=msg):
            select(df.groupby("a")).sample(n=n)
|
||||
|
||||
|
||||
def test_groupby_sample_oversample():
    """Expect each group to double with ``frac=2.0`` and ``replace=True``."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})
    sampled = [1] * 20 + [2] * 20

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(frac=2.0, replace=True)
    frame_expected = DataFrame({"a": sampled, "b": sampled}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
    ser_expected = Series(sampled, name="b", index=ser_result.index)
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
def test_groupby_sample_without_n_or_frac():
    """Expect one row per group when ``n`` and ``frac`` are both None."""
    population = [1] * 10 + [2] * 10
    df = DataFrame({"a": population, "b": population})

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(n=None, frac=None)
    frame_expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=frame_result.index)
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(n=None, frac=None)
    ser_expected = Series([1, 2], name="b", index=ser_result.index)
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    """Check that zero-weight rows are never drawn, for str and int indexes."""
    # GH 39927 - tests for integer index needed
    values = [1] * 2 + [2] * 2
    weights = [1, 0, 1, 0]
    df = DataFrame({"a": values, "b": values}, index=Index(index))

    # DataFrameGroupBy
    frame_result = df.groupby("a").sample(n=2, replace=True, weights=weights)
    frame_expected = DataFrame(
        {"a": values, "b": values}, index=Index(expected_index)
    )
    tm.assert_frame_equal(frame_result, frame_expected)

    # SeriesGroupBy
    ser_result = df.groupby("a")["b"].sample(n=2, replace=True, weights=weights)
    ser_expected = Series(values, name="b", index=Index(expected_index))
    tm.assert_series_equal(ser_result, ser_expected)
|
||||
|
||||
|
||||
def test_groupby_sample_with_selections():
    """Sampling after a column selection returns only the selected columns."""
    # GH 39928
    column = [1] * 10 + [2] * 10
    frame = DataFrame({"a": column, "b": column, "c": column})

    sampled = frame.groupby("a")[["b", "c"]].sample(n=None, frac=None)

    # The sampled index is random, so reuse it when building the expectation.
    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=sampled.index)
    tm.assert_frame_equal(sampled, expected)
|
||||
@@ -0,0 +1,67 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    """``size()`` reports, for every group key, the length of that group."""
    grouped = df.groupby(by=by)
    sizes = grouped.size()
    for group_key, members in grouped:
        assert sizes[group_key] == len(members)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(df, sort, by):
    """``size()`` matches a per-group row count computed through ``apply``."""
    # The `df` fixture argument is not used; a random frame is built instead.
    frame = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC"))

    via_size = frame.groupby(by=by, sort=sort).size()
    via_apply = frame.groupby(by=by, sort=sort)["C"].apply(lambda col: col.shape[0])

    tm.assert_series_equal(via_size, via_apply, check_names=False)
|
||||
|
||||
|
||||
def test_size_series_dataframe():
    """``size()`` on an empty frame yields an empty int64 Series named by the key."""
    # https://github.com/pandas-dev/pandas/issues/11699
    empty = DataFrame(columns=["A", "B"])
    expected = Series(dtype="int64", index=Index([], name="A"))

    result = empty.groupby("A").size()

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_size_groupby_all_null():
    """Grouping on an all-null key gives an empty ``size()`` result, not an error."""
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    all_null = DataFrame({"A": [None, None]})  # all-null groups

    sizes = all_null.groupby("A").size()

    tm.assert_series_equal(sizes, Series(dtype="int64", index=Index([], name="A")))
|
||||
|
||||
|
||||
def test_size_period_index():
    """``size()`` works when grouping a Series by its PeriodIndex level."""
    # https://github.com/pandas-dev/pandas/issues/34010
    period_idx = PeriodIndex(["2000"], name="A", freq="D")
    ser = Series([1], index=period_idx)

    sizes = ser.groupby(level="A").size()

    # One row per period, so the sizes equal the original single-valued Series.
    tm.assert_series_equal(sizes, ser)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
    """``size()`` with a categorical key also reports the empty combinations."""
    frame = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    frame["A"] = frame["A"].astype("category")

    result = frame.groupby(["A", "B"], as_index=as_index).size()

    # Every (A, B) pairing is listed; unobserved ones have size 0.
    rows = [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]]
    expected = DataFrame(rows, columns=["A", "B", "size"])
    expected["A"] = expected["A"].astype("category")
    if as_index:
        # With as_index=True the result is an unnamed Series keyed by (A, B).
        expected = expected.set_index(["A", "B"])["size"].rename(None)

    tm.assert_equal(result, expected)
|
||||
@@ -0,0 +1,910 @@
|
||||
""" test with the TimeGrouper / grouping with datetimes """
|
||||
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
offsets,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.groupby.grouper import Grouper
|
||||
from pandas.core.groupby.ops import BinGrouper
|
||||
|
||||
|
||||
@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame used by groupby_with_truncated_bingrouper, made into
    a separate fixture for easier re-use in
    test_groupby_apply_timegrouper_with_nat_apply_squeeze
    """
    quantities = [18, 3, 5, 1, 9, 3]
    # One entry is NaT on purpose.
    dates = [
        Timestamp(2013, 9, 1, 13, 0),
        Timestamp(2013, 9, 1, 13, 5),
        Timestamp(2013, 10, 1, 20, 0),
        Timestamp(2013, 10, 3, 10, 0),
        pd.NaT,
        Timestamp(2013, 9, 2, 14, 0),
    ]
    return DataFrame({"Quantity": quantities, "Date": dates})
|
||||
|
||||
|
||||
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy over 5-day bins of the "Date" column, built so that
    gb.grouper.result_index and gb.grouper.group_keys_seq have
    different lengths (asserted below).

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    as either the index or an index level.
    """
    five_day_bins = Grouper(key="Date", freq="5D")
    gb = frame_for_truncated_bingrouper.groupby(five_day_bins)

    # check we're testing the case we're interested in
    n_result = len(gb.grouper.result_index)
    n_keys = len(gb.grouper.group_keys_seq)
    assert n_result != n_keys

    return gb
|
||||
|
||||
|
||||
class TestGroupBy:
|
||||
def test_groupby_with_timegrouper(self):
|
||||
# GH 4161
|
||||
# TimeGrouper requires a sorted index
|
||||
# also verifies that the resultant index has the correct name
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
|
||||
"Quantity": [18, 3, 5, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 9, 1, 13, 0),
|
||||
datetime(2013, 9, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 3, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 9, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
# GH 6908 change target column's order
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
df = df.set_index(["Date"])
|
||||
|
||||
expected = DataFrame(
|
||||
{"Quantity": 0},
|
||||
index=date_range(
|
||||
"20130901", "20131205", freq="5D", name="Date", inclusive="left"
|
||||
),
|
||||
)
|
||||
expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")
|
||||
|
||||
result1 = df.resample("5D").sum()
|
||||
tm.assert_frame_equal(result1, expected)
|
||||
|
||||
df_sorted = df.sort_index()
|
||||
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
|
||||
tm.assert_frame_equal(result2, expected)
|
||||
|
||||
result3 = df.groupby(Grouper(freq="5D")).sum()
|
||||
tm.assert_frame_equal(result3, expected)
|
||||
|
||||
@pytest.mark.parametrize("should_sort", [True, False])
|
||||
def test_groupby_with_timegrouper_methods(self, should_sort):
|
||||
# GH 3881
|
||||
# make sure API of timegrouper conforms
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 8, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
if should_sort:
|
||||
df = df.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
df = df.set_index("Date", drop=False)
|
||||
g = df.groupby(Grouper(freq="6M"))
|
||||
assert g.group_keys
|
||||
|
||||
assert isinstance(g.grouper, BinGrouper)
|
||||
groups = g.groups
|
||||
assert isinstance(groups, dict)
|
||||
assert len(groups) == 3
|
||||
|
||||
def test_timegrouper_with_reg_groups(self):
|
||||
|
||||
# GH 3794
|
||||
# allow combination of timegrouper/reg groups
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
).set_index("Date")
|
||||
|
||||
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
for df in [df_original, df_sorted]:
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
datetime(2013, 12, 31, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
|
||||
result = df.groupby([Grouper(freq="A"), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Mark Carl Joe".split(),
|
||||
"Quantity": [1, 3, 9, 18],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 0, 0),
|
||||
datetime(2013, 1, 1, 0, 0),
|
||||
datetime(2013, 7, 1, 0, 0),
|
||||
datetime(2013, 7, 1, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 10, 1, 13, 0),
|
||||
datetime(2013, 10, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 10, 2, 12, 0),
|
||||
datetime(2013, 10, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
).set_index("Date")
|
||||
|
||||
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
|
||||
for df in [df_original, df_sorted]:
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark Carl Joe".split(),
|
||||
"Quantity": [6, 8, 3, 4, 10],
|
||||
"Date": [
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 1, 0, 0),
|
||||
datetime(2013, 10, 2, 0, 0),
|
||||
datetime(2013, 10, 2, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
|
||||
result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
datetime(2013, 10, 31, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# passing the name
|
||||
df = df.reset_index()
|
||||
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
|
||||
df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum()
|
||||
|
||||
# passing the level
|
||||
df = df.set_index("Date")
|
||||
result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
with pytest.raises(ValueError, match="The level foo is not valid"):
|
||||
df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum()
|
||||
|
||||
# multi names
|
||||
df = df.copy()
|
||||
df["Date"] = df.index + offsets.MonthEnd(2)
|
||||
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Mark".split(),
|
||||
"Quantity": [10, 18, 3],
|
||||
"Date": [
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
datetime(2013, 11, 30, 0, 0),
|
||||
],
|
||||
}
|
||||
).set_index(["Date", "Buyer"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# error as we have both a level and a name!
|
||||
msg = "The Grouper cannot specify both a key and a level!"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.groupby(
|
||||
[Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
|
||||
).sum()
|
||||
|
||||
# single groupers
|
||||
expected = DataFrame(
|
||||
[[31]],
|
||||
columns=["Quantity"],
|
||||
index=DatetimeIndex(
|
||||
[datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
|
||||
),
|
||||
)
|
||||
result = df.groupby(Grouper(freq="1M")).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1M")]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected.index = expected.index.shift(1)
|
||||
assert expected.index.freq == offsets.MonthEnd()
|
||||
result = df.groupby(Grouper(freq="1M", key="Date")).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby([Grouper(freq="1M", key="Date")]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
|
||||
def test_timegrouper_with_reg_groups_freq(self, freq):
|
||||
# GH 6764 multiple grouping with/without sort
|
||||
df = DataFrame(
|
||||
{
|
||||
"date": pd.to_datetime(
|
||||
[
|
||||
"20121002",
|
||||
"20121007",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20121002",
|
||||
"20121207",
|
||||
"20130130",
|
||||
"20130202",
|
||||
"20130305",
|
||||
"20130202",
|
||||
"20130305",
|
||||
]
|
||||
),
|
||||
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
|
||||
"whole_cost": [
|
||||
1790,
|
||||
364,
|
||||
280,
|
||||
259,
|
||||
201,
|
||||
623,
|
||||
90,
|
||||
312,
|
||||
359,
|
||||
301,
|
||||
359,
|
||||
801,
|
||||
],
|
||||
"cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
|
||||
}
|
||||
).set_index("date")
|
||||
|
||||
expected = (
|
||||
df.groupby("user_id")["whole_cost"]
|
||||
.resample(freq)
|
||||
.sum(min_count=1) # XXX
|
||||
.dropna()
|
||||
.reorder_levels(["date", "user_id"])
|
||||
.sort_index()
|
||||
.astype("int64")
|
||||
)
|
||||
expected.name = "whole_cost"
|
||||
|
||||
result1 = (
|
||||
df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
)
|
||||
tm.assert_series_equal(result1, expected)
|
||||
|
||||
result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
|
||||
tm.assert_series_equal(result2, expected)
|
||||
|
||||
def test_timegrouper_get_group(self):
|
||||
# GH 6914
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Joe Carl Joe Carl".split(),
|
||||
"Quantity": [18, 3, 5, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 9, 1, 13, 0),
|
||||
datetime(2013, 9, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 3, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 9, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
# single grouping
|
||||
expected_list = [
|
||||
df_original.iloc[[0, 1, 5]],
|
||||
df_original.iloc[[2, 3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(Grouper(freq="M", key="Date"))
|
||||
for t, expected in zip(dt_list, expected_list):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group(dt)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# multiple grouping
|
||||
expected_list = [
|
||||
df_original.iloc[[1]],
|
||||
df_original.iloc[[3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(["Buyer", Grouper(freq="M", key="Date")])
|
||||
for (b, t), expected in zip(g_list, expected_list):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group((b, dt))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# with index
|
||||
df_original = df_original.set_index("Date")
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
expected_list = [
|
||||
df_original.iloc[[0, 1, 5]],
|
||||
df_original.iloc[[2, 3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(Grouper(freq="M"))
|
||||
for t, expected in zip(dt_list, expected_list):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group(dt)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_timegrouper_apply_return_type_series(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_series(x):
|
||||
return Series([x["value"].sum()], ("sum",))
|
||||
|
||||
expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
|
||||
result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series)
|
||||
tm.assert_frame_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_timegrouper_apply_return_type_value(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_value(x):
|
||||
return x.value.sum()
|
||||
|
||||
expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
|
||||
result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
|
||||
tm.assert_series_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_groupby_groups_datetimeindex(self):
|
||||
# GH#1430
|
||||
periods = 1000
|
||||
ind = date_range(start="2012/1/1", freq="5min", periods=periods)
|
||||
df = DataFrame(
|
||||
{"high": np.arange(periods), "low": np.arange(periods)}, index=ind
|
||||
)
|
||||
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
|
||||
|
||||
# it works!
|
||||
groups = grouped.groups
|
||||
assert isinstance(list(groups.keys())[0], datetime)
|
||||
|
||||
# GH#11442
|
||||
index = date_range("2015/01/01", periods=5, name="date")
|
||||
df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
|
||||
result = df.groupby(level="date").groups
|
||||
dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
|
||||
expected = {
|
||||
Timestamp(date): DatetimeIndex([date], name="date") for date in dates
|
||||
}
|
||||
tm.assert_dict_equal(result, expected)
|
||||
|
||||
grouped = df.groupby(level="date")
|
||||
for date in dates:
|
||||
result = grouped.get_group(date)
|
||||
data = [[df.loc[date, "A"], df.loc[date, "B"]]]
|
||||
expected_index = DatetimeIndex([date], name="date", freq="D")
|
||||
expected = DataFrame(data, columns=list("AB"), index=expected_index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_groups_datetimeindex_tz(self):
|
||||
# GH 3950
|
||||
dates = [
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
]
|
||||
df = DataFrame(
|
||||
{
|
||||
"label": ["a", "a", "a", "b", "b", "b"],
|
||||
"datetime": dates,
|
||||
"value1": np.arange(6, dtype="int64"),
|
||||
"value2": [1, 2] * 3,
|
||||
}
|
||||
)
|
||||
df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))
|
||||
|
||||
exp_idx1 = DatetimeIndex(
|
||||
[
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
tz="US/Pacific",
|
||||
name="datetime",
|
||||
)
|
||||
exp_idx2 = Index(["a", "b"] * 3, name="label")
|
||||
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
||||
expected = DataFrame(
|
||||
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(["datetime", "label"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# by level
|
||||
didx = DatetimeIndex(dates, tz="Asia/Tokyo")
|
||||
df = DataFrame(
|
||||
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
|
||||
index=didx,
|
||||
)
|
||||
|
||||
exp_idx = DatetimeIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
tz="Asia/Tokyo",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(level=0).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_datetime64_handling_groupby(self):
|
||||
# it works!
|
||||
df = DataFrame(
|
||||
[(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
|
||||
columns=["a", "date"],
|
||||
)
|
||||
result = df.groupby("a").first()
|
||||
assert result["date"][3] == Timestamp("2012-07-03")
|
||||
|
||||
def test_groupby_multi_timezone(self):
|
||||
|
||||
# combining multiple / different timezones yields UTC
|
||||
|
||||
data = """0,2000-01-28 16:47:00,America/Chicago
|
||||
1,2000-01-29 16:48:00,America/Chicago
|
||||
2,2000-01-30 16:49:00,America/Los_Angeles
|
||||
3,2000-01-31 16:50:00,America/Chicago
|
||||
4,2000-01-01 16:50:00,America/New_York"""
|
||||
|
||||
df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
|
||||
result = df.groupby("tz").date.apply(
|
||||
lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
|
||||
)
|
||||
|
||||
expected = Series(
|
||||
[
|
||||
Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
|
||||
Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
|
||||
],
|
||||
name="date",
|
||||
dtype=object,
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
tz = "America/Chicago"
|
||||
res_values = df.groupby("tz").date.get_group(tz)
|
||||
result = pd.to_datetime(res_values).dt.tz_localize(tz)
|
||||
exp_values = Series(
|
||||
["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
|
||||
index=[0, 1, 3],
|
||||
name="date",
|
||||
)
|
||||
expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_groups_periods(self):
|
||||
dates = [
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
]
|
||||
df = DataFrame(
|
||||
{
|
||||
"label": ["a", "a", "a", "b", "b", "b"],
|
||||
"period": [pd.Period(d, freq="H") for d in dates],
|
||||
"value1": np.arange(6, dtype="int64"),
|
||||
"value2": [1, 2] * 3,
|
||||
}
|
||||
)
|
||||
|
||||
exp_idx1 = pd.PeriodIndex(
|
||||
[
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
freq="H",
|
||||
name="period",
|
||||
)
|
||||
exp_idx2 = Index(["a", "b"] * 3, name="label")
|
||||
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
||||
expected = DataFrame(
|
||||
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(["period", "label"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# by level
|
||||
didx = pd.PeriodIndex(dates, freq="H")
|
||||
df = DataFrame(
|
||||
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
|
||||
index=didx,
|
||||
)
|
||||
|
||||
exp_idx = pd.PeriodIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
freq="H",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(level=0).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_first_datetime64(self):
|
||||
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
|
||||
df[1] = df[1].view("M8[ns]")
|
||||
|
||||
assert issubclass(df[1].dtype.type, np.datetime64)
|
||||
|
||||
result = df.groupby(level=0).first()
|
||||
got_dt = result[1].dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
result = df[1].groupby(level=0).first()
|
||||
got_dt = result.dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
def test_groupby_max_datetime64(self):
|
||||
# GH 5869
|
||||
# datetimelike dtype conversion from int
|
||||
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
|
||||
expected = df.groupby("A")["A"].apply(lambda x: x.max())
|
||||
result = df.groupby("A")["A"].max()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_datetime64_32_bit(self):
|
||||
# GH 6410 / numpy 4328
|
||||
# 32-bit under 1.9-dev indexing issue
|
||||
|
||||
df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
|
||||
result = df.groupby("A")["B"].transform(min)
|
||||
expected = Series([Timestamp("2000-01-1")] * 2, name="B")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_with_timezone_selection(self):
|
||||
# GH 11616
|
||||
# Test that column selection returns output in correct timezone.
|
||||
np.random.seed(42)
|
||||
df = DataFrame(
|
||||
{
|
||||
"factor": np.random.randint(0, 3, size=60),
|
||||
"time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
|
||||
}
|
||||
)
|
||||
df1 = df.groupby("factor").max()["time"]
|
||||
df2 = df.groupby("factor")["time"].max()
|
||||
tm.assert_series_equal(df1, df2)
|
||||
|
||||
def test_timezone_info(self):
|
||||
# see gh-11682: Timezone info lost when broadcasting
|
||||
# scalar datetime to DataFrame
|
||||
|
||||
df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
|
||||
assert df["b"][0].tzinfo == pytz.utc
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
df["b"] = datetime.now(pytz.utc)
|
||||
assert df["b"][0].tzinfo == pytz.utc
|
||||
|
||||
def test_datetime_count(self):
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="T")}
|
||||
)
|
||||
result = df.groupby("a").dates.count()
|
||||
expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_first_last_max_min_on_time_data(self):
|
||||
# GH 10295
|
||||
# Verify that NaT is not in the result of max, min, first and last on
|
||||
# Dataframe with datetime or timedelta values.
|
||||
from datetime import timedelta as td
|
||||
|
||||
df_test = DataFrame(
|
||||
{
|
||||
"dt": [
|
||||
np.nan,
|
||||
"2015-07-24 10:10",
|
||||
"2015-07-25 11:11",
|
||||
"2015-07-23 12:12",
|
||||
np.nan,
|
||||
],
|
||||
"td": [np.nan, td(days=1), td(days=2), td(days=3), np.nan],
|
||||
}
|
||||
)
|
||||
df_test.dt = pd.to_datetime(df_test.dt)
|
||||
df_test["group"] = "A"
|
||||
df_ref = df_test[df_test.dt.notna()]
|
||||
|
||||
grouped_test = df_test.groupby("group")
|
||||
grouped_ref = df_ref.groupby("group")
|
||||
|
||||
tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
|
||||
tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
|
||||
tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
|
||||
tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
|
||||
|
||||
def test_nunique_with_timegrouper_and_nat(self):
|
||||
# GH 17575
|
||||
test = DataFrame(
|
||||
{
|
||||
"time": [
|
||||
Timestamp("2016-06-28 09:35:35"),
|
||||
pd.NaT,
|
||||
Timestamp("2016-06-28 16:46:28"),
|
||||
],
|
||||
"data": ["1", "2", "3"],
|
||||
}
|
||||
)
|
||||
|
||||
grouper = Grouper(key="time", freq="h")
|
||||
result = test.groupby(grouper)["data"].nunique()
|
||||
expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
|
||||
expected.index = expected.index._with_freq(None)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_scalar_call_versus_list_call(self):
|
||||
# Issue: 17530
|
||||
data_frame = {
|
||||
"location": ["shanghai", "beijing", "shanghai"],
|
||||
"time": Series(
|
||||
["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
|
||||
dtype="datetime64[ns]",
|
||||
),
|
||||
"value": [1, 2, 3],
|
||||
}
|
||||
data_frame = DataFrame(data_frame).set_index("time")
|
||||
grouper = Grouper(freq="D")
|
||||
|
||||
grouped = data_frame.groupby(grouper)
|
||||
result = grouped.count()
|
||||
grouped = data_frame.groupby([grouper])
|
||||
expected = grouped.count()
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_grouper_period_index(self):
|
||||
# GH 32108
|
||||
periods = 2
|
||||
index = pd.period_range(
|
||||
start="2018-01", periods=periods, freq="M", name="Month"
|
||||
)
|
||||
period_series = Series(range(periods), index=index)
|
||||
result = period_series.groupby(period_series.index.month).sum()
|
||||
|
||||
expected = Series(
|
||||
range(0, periods), index=Index(range(1, periods + 1), name=index.name)
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_dict_returns(
|
||||
self, groupby_with_truncated_bingrouper
|
||||
):
|
||||
# GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq
|
||||
# have different lengths that goes through the `isinstance(values[0], dict)`
|
||||
# path
|
||||
gb = groupby_with_truncated_bingrouper
|
||||
|
||||
res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
|
||||
|
||||
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
|
||||
mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
|
||||
expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
|
||||
tm.assert_series_equal(res, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
|
||||
self, groupby_with_truncated_bingrouper
|
||||
):
|
||||
# GH#43500 Previously raised ValueError bc used index with incorrect
|
||||
# length in wrap_applied_result
|
||||
gb = groupby_with_truncated_bingrouper
|
||||
|
||||
res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
|
||||
|
||||
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
|
||||
expected = Series(
|
||||
[18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
|
||||
index=dti._with_freq(None),
|
||||
name="Quantity",
|
||||
)
|
||||
|
||||
tm.assert_series_equal(res, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
    self, frame_for_truncated_bingrouper
):
    """Apply a Series-returning function on a ``squeeze=True`` groupby."""
    frame = frame_for_truncated_bingrouper

    # We need to create a GroupBy object with only one non-NaT group,
    #  so use a huge freq so that all non-NaT dates will be grouped together
    century_grouper = Grouper(key="Date", freq="100Y")

    with tm.assert_produces_warning(FutureWarning, match="`squeeze` parameter"):
        gb = frame.groupby(century_grouper, squeeze=True)

    # check that we will go through the singular_series path
    #  in _wrap_applied_output_series
    assert gb.ngroups == 1
    grouped_axis = gb._selected_obj._get_axis(gb.axis)
    assert grouped_axis.nlevels == 1

    # function that returns a Series
    res = gb.apply(lambda grp: grp["Quantity"] * 2)

    # index labels of the rows with a non-null "Date", sorted by that date
    ordering = frame["Date"].sort_values().dropna().index
    bin_label = Timestamp("2013-12-31")
    expected_index = MultiIndex.from_product(
        [[bin_label], ordering], names=["Date", None]
    )

    doubled = frame["Quantity"].take(ordering).values * 2
    expected = Series(doubled, index=expected_index, name="Quantity")
    tm.assert_series_equal(res, expected)
|
||||
|
||||
@td.skip_if_no("numba")
def test_groupby_agg_numba_timegrouper_with_nat(
    self, groupby_with_truncated_bingrouper
):
    """Compare numba-engine ``nanmean`` against the default engine."""
    # See discussion in GH#43487
    gb = groupby_with_truncated_bingrouper

    # SeriesGroupBy selection
    numba_series = gb["Quantity"].aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    default_series = gb["Quantity"].aggregate(np.nanmean)
    tm.assert_series_equal(numba_series, default_series)

    # DataFrameGroupBy selection of the same column
    numba_frame = gb[["Quantity"]].aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    default_frame = gb[["Quantity"]].aggregate(np.nanmean)
    tm.assert_frame_equal(numba_frame, default_frame)
|
||||
@@ -0,0 +1,174 @@
|
||||
"""
|
||||
These tests systematically exercise all of the arguments to value_counts
with different size combinations. This ensures stability of the sorting
and proper parameter handling.
|
||||
"""
|
||||
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Grouper,
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
# our starting frame
|
||||
def seed_df(seed_nans, n, m):
    """
    Build the deterministic frame used by the value_counts tests.

    Parameters
    ----------
    seed_nans : bool
        If truthy, overwrite regularly spaced rows of each column with NaN.
    n : int
        Number of rows to draw.
    m : int
        Inclusive upper bound of the random integers in column "3rd".

    Returns
    -------
    DataFrame
        Columns "1st" (letters drawn from "abcd"), "2nd" (days drawn from a
        10-day range) and "3rd" (integers drawn from [1, m]).
    """
    # reseed on every call so that identical arguments give identical draws
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    # the order of these three draws determines the generated values
    letters = np.random.choice(list("abcd"), n)
    dates = np.random.choice(days, n)
    numbers = np.random.randint(1, m + 1, n)
    frame = DataFrame({"1st": letters, "2nd": dates, "3rd": numbers})

    if not seed_nans:
        return frame

    # (first row label, step, column) of every NaN stripe
    nan_stripes = (
        (1, 11, "1st"),
        (3, 17, "2nd"),
        (7, 19, "3rd"),
        (8, 19, "3rd"),
        (9, 19, "3rd"),
    )
    for first, step, column in nan_stripes:
        frame.loc[first::step, column] = np.nan

    return frame
|
||||
|
||||
|
||||
# create input df, keys, and the bins
|
||||
# parametrization payload for test_series_groupby_value_counts:
# one (frame, key, bins, n, m) tuple per case plus a matching id string
binned = []
ids = []
for seed_nans, n, m in product([True, False], (100, 1000), (5, 20)):
    df = seed_df(seed_nans, n, m)
    # either no binning, or edges stepping by 2 from 0
    bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
    keys = "1st", "2nd", ["1st", "2nd"]
    for k, b in product(keys, bins):
        binned.append((df, k, b, n, m))
        ids.append(f"{k}-{n}-{m}")
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    df, keys, bins, n, m, isort, normalize, sort, ascending, dropna
):
    """Compare ``SeriesGroupBy.value_counts`` with applying ``Series.value_counts``."""

    def rebuild_index(obj):
        # replace the index with a MultiIndex built from its own level values
        level_values = [
            obj.index.get_level_values(level) for level in range(obj.index.nlevels)
        ]
        obj.index = MultiIndex.from_arrays(level_values, names=obj.index.names)
        return obj

    kwargs = dict(
        normalize=normalize,
        sort=sort,
        ascending=ascending,
        dropna=dropna,
        bins=bins,
    )

    grouped_left = df.groupby(keys, sort=isort)
    left = grouped_left["3rd"].value_counts(**kwargs)

    grouped_right = df.groupby(keys, sort=isort)
    right = grouped_right["3rd"].apply(Series.value_counts, **kwargs)
    right.index.names = right.index.names[:-1] + ["3rd"]

    # have to sort on index because of unstable sort on values
    left = rebuild_index(left)  # xref GH9212
    right = rebuild_index(right)
    tm.assert_series_equal(left.sort_index(), right.sort_index())
|
||||
|
||||
|
||||
def test_series_groupby_value_counts_with_grouper():
|
||||
# GH28479
|
||||
df = DataFrame(
|
||||
{
|
||||
"Timestamp": [
|
||||
1565083561,
|
||||
1565083561 + 86400,
|
||||
1565083561 + 86500,
|
||||
1565083561 + 86400 * 2,
|
||||
1565083561 + 86400 * 3,
|
||||
1565083561 + 86500 * 3,
|
||||
1565083561 + 86400 * 4,
|
||||
],
|
||||
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
|
||||
}
|
||||
).drop([3])
|
||||
|
||||
df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
|
||||
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
|
||||
|
||||
# have to sort on index because of unstable sort on values xref GH9212
|
||||
result = dfg["Food"].value_counts().sort_index()
|
||||
expected = dfg["Food"].apply(Series.value_counts).sort_index()
|
||||
expected.index.names = result.index.names
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    """``value_counts`` on an empty frame gives an empty MultiIndexed Series."""
    # GH39172
    *group_cols, value_col = columns
    empty = DataFrame(columns=columns)

    result = empty.groupby(group_cols)[value_col].value_counts()

    # one empty level per column, grouping columns first
    expected_index = MultiIndex.from_arrays([[]] * len(columns), names=columns)
    expected = Series([], name=value_col, dtype=result.dtype)
    expected.index = expected_index

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    """``value_counts`` on a one-row frame matches ``DataFrame.value_counts``."""
    # GH42618
    *group_cols, value_col = columns
    single_row = DataFrame(data=[range(len(columns))], columns=columns)

    result = single_row.groupby(group_cols)[value_col].value_counts()
    expected = single_row.value_counts().rename(value_col)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_series_groupby_value_counts_on_categorical():
|
||||
# GH38672
|
||||
|
||||
s = Series(Categorical(["a"], categories=["a", "b"]))
|
||||
result = s.groupby([0]).value_counts()
|
||||
|
||||
expected = Series(
|
||||
data=[1, 0],
|
||||
index=MultiIndex.from_arrays(
|
||||
[
|
||||
[0, 0],
|
||||
CategoricalIndex(
|
||||
["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
|
||||
),
|
||||
]
|
||||
),
|
||||
name=0,
|
||||
)
|
||||
|
||||
# Expected:
|
||||
# 0 a 1
|
||||
# b 0
|
||||
# Name: 0, dtype: int64
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,205 @@
|
||||
import pytest
|
||||
|
||||
from pandas.errors import NumbaUtilError
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_correct_function_signature():
    """A UDF taking only ``x`` is rejected by the numba transform engine."""

    def incorrect_function(x):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # None -> DataFrameGroupBy, "data" -> SeriesGroupBy
    for column in (None, "data"):
        gb = frame.groupby("key")
        if column is not None:
            gb = gb[column]
        with pytest.raises(NumbaUtilError, match="The first 2"):
            gb.transform(incorrect_function, engine="numba")
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_check_nopython_kwargs():
    """Passing a keyword argument through to a numba UDF raises NumbaUtilError."""

    def incorrect_function(x, **kwargs):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # None -> DataFrameGroupBy, "data" -> SeriesGroupBy
    for column in (None, "data"):
        gb = frame.groupby("key")
        if column is not None:
            gb = gb[column]
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            gb.transform(incorrect_function, engine="numba", a=1)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython):
    """The numba transform result equals the cython-engine result."""

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    grouped = frame.groupby(0)
    # narrow to the value column when the Series code path is requested
    grouped = grouped[1] if pandas_obj == "Series" else grouped

    via_numba = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
    via_cython = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(via_numba, via_cython)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    # Test that the functions are cached correctly if we switch functions
    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    grouped = frame.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    def check_against_cython(numba_func, cython_func):
        # run the numba engine and compare with the cython reference
        result = grouped.transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
        expected = grouped.transform(cython_func, engine="cython")
        tm.assert_equal(result, expected)

    check_against_cython(func_1, lambda x: x + 1)
    # func_1 should be in the cache now
    assert (func_1, "groupby_transform") in NUMBA_FUNC_CACHE

    # Add func_2 to the cache
    check_against_cython(func_2, lambda x: x * 5)
    assert (func_2, "groupby_transform") in NUMBA_FUNC_CACHE

    # Retest func_1 which should use the cache
    check_against_cython(func_1, lambda x: x + 1)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_use_global_config():
    """``engine=None`` with ``compute.use_numba`` on matches ``engine="numba"``."""

    def func_1(values, index):
        return values + 1

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    # reference: numba requested explicitly
    explicit = grouped.transform(func_1, engine="numba")

    # engine left unset while the option is switched on
    with option_context("compute.use_numba", True):
        via_option = grouped.transform(func_1, engine=None)

    tm.assert_frame_equal(explicit, via_option)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_multifunc_notimplimented(agg_func):
    """List, string and dict function specs raise with the numba engine."""
    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = frame.groupby(0)

    # check the DataFrameGroupBy first, then the SeriesGroupBy selection
    for select_column in (False, True):
        with pytest.raises(NotImplementedError, match="Numba engine can"):
            target = grouped[1] if select_column else grouped
            target.transform(agg_func, engine="numba")
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_args_not_cached():
    """A changed positional argument must change the numba transform result."""
    # GH 41647
    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]

    # same UDF and groupby object; only the extra argument changes
    for n_last, group_total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.transform(sum_last, n_last, engine="numba")
        expected = Series([group_total] * 4, name="x")
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_index_data_correctly_passed():
    """The UDF's ``index`` argument is used to compute the transform output."""
    # GH 43133
    def f(values, index):
        return index - 1

    row_labels = [-1, -2, -3]
    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=row_labels)

    result = frame.groupby("group").transform(f, engine="numba")

    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=row_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Numba transform works when grouping by one level of a MultiIndex."""

    def numba_func(values, index):
        return 1

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    indexed = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])

    result = indexed.groupby("A").transform(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )

    # same index; the value column is replaced by the UDF's constant as float
    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@td.skip_if_no("numba")
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Numba transform raises when grouping by more than one key."""

    def numba_func(values, index):
        return 1

    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    indexed = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])

    with pytest.raises(NotImplementedError, match="More than 1 grouping labels"):
        two_key_gb = indexed.groupby(["A", "B"])
        two_key_gb.transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user