first commit

This commit is contained in:
Carla Floricel
2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions

View File

@@ -0,0 +1,393 @@
"""
test cython .agg behavior
"""
import numpy as np
import pytest
from pandas.core.dtypes.common import is_float_dtype
import pandas as pd
from pandas import (
DataFrame,
Index,
NaT,
Series,
Timedelta,
Timestamp,
bdate_range,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Compare a groupby reduction with the same reduction applied per group.

    The reduction named by ``op_name`` is called on the groupby object and,
    separately, on column "C" of every group; the two are then compared.
    """
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.randn(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def reduce(obj):
        # Dispatch by name so the same helper works on Series and GroupBy.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for key, chunk in by_a:
        per_group[key] = reduce(chunk["C"])
    exp = DataFrame({"C": per_group})
    exp.index.name = "A"
    tm.assert_frame_equal(reduce(by_a), exp)

    # multiple columns
    by_ab = frame.groupby(["A", "B"])
    nested = {}
    for (outer, inner), chunk in by_ab:
        nested.setdefault(outer, {})[inner] = reduce(chunk["C"])
    exp = DataFrame(nested).T.stack(dropna=False)
    exp.index.names = ["A", "B"]
    exp.name = "C"
    result = reduce(by_ab)["C"]
    # Only sum and prod are compared in the two-key case.
    if op_name in ("sum", "prod"):
        tm.assert_series_equal(result, exp)
def test_cython_agg_boolean():
    """Group-wise ``mean`` of a boolean column must equal ``agg(np.mean)``."""
    columns = {
        "a": np.random.randint(0, 5, 50),
        "b": np.random.randint(0, 2, 50).astype("bool"),
    }
    frame = DataFrame(columns)

    via_method = frame.groupby("a")["b"].mean()
    via_agg = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(via_method, via_agg)
def test_cython_agg_nothing_to_agg():
    """Check ``mean`` on a string-only selection: errors, then an empty frame."""

    def string_frame():
        # Integer keys in "a", alternating strings in "b".
        return DataFrame(
            {"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}
        )

    frame = string_frame()
    with pytest.raises(NotImplementedError, match="does not implement"):
        frame.groupby("a")["b"].mean(numeric_only=True)
    with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
        frame.groupby("a")["b"].mean()

    frame = string_frame()
    with tm.assert_produces_warning(FutureWarning):
        result = frame[["b"]].groupby(frame["a"]).mean()
    group_keys = frame["a"].sort_values().drop_duplicates()
    expected = DataFrame([], index=group_keys)
    tm.assert_frame_equal(result, expected)
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a datetime column raises NotImplementedError."""
    columns = {
        "a": np.random.randint(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="T"),
    }
    frame = DataFrame(columns)
    date_groups = frame.groupby("b").dates
    with pytest.raises(NotImplementedError, match="does not implement"):
        date_groups.mean(numeric_only=True)
def test_cython_agg_frame_columns():
    """Repeated column-axis groupby ``mean`` calls must not raise (#2113)."""
    # #2113
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    # The same call is issued four times on purpose; only the absence of an
    # exception is checked.
    for _ in range(4):
        df.groupby(level=0, axis="columns").mean()
def test_cython_agg_return_dict():
    """An aggregation function may return a dict per group (GH 16741)."""
    # GH 16741
    a_values = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    b_values = ["one", "one", "two", "three", "two", "two", "one", "three"]
    df = DataFrame(
        {
            "A": a_values,
            "B": b_values,
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    def counts_as_dict(col):
        return col.value_counts().to_dict()

    ts = df.groupby("A")["B"].agg(lambda x: counts_as_dict(x))

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)
def test_cython_fail_agg():
    """``sum`` on a string Series grouped by month equals ``agg(np.sum)``."""
    business_days = bdate_range("1/1/2000", periods=50)
    letters = Series(["A", "B", "C", "D", "E"] * 10, index=business_days)
    by_month = letters.groupby(lambda stamp: stamp.month)

    summed = by_month.sum()
    expected = by_month.agg(np.sum)

    tm.assert_series_equal(summed, expected)
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` must match ``agg(targop)`` on random data."""
    values = DataFrame(np.random.randn(1000))
    keys = np.random.randint(0, 50, size=1000).astype(float)

    result = values.groupby(keys)._cython_agg_general(
        op, alt=None, numeric_only=True
    )
    expected = values.groupby(keys).agg(targop)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Compare ``_cython_agg_general`` with ``agg`` when grouping by ``pd.cut`` bins."""
    df = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    def binned():
        # A fresh groupby is built for each side of the comparison.
        return df.groupby(pd.cut(df[0], bin_edges), observed=observed)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    result = binned()._cython_agg_general(op, alt=None, numeric_only=True)
    expected = binned().agg(lambda x: targop(x))

    tm.assert_frame_equal(result, expected)
def test_cython_agg_empty_buckets_nanops(observed):
    """Check "add" and "prod" over ``pd.cut`` bins against hardcoded results."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = DataFrame([11, 12, 13], columns=["a"])
    grps = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected values per bin, value filtered out when observed)
    cases = (
        ("add", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    )
    for how, per_bin, filler in cases:
        binned = df.groupby(pd.cut(df["a"], grps), observed=observed)
        result = binned._cython_agg_general(how, alt=None, numeric_only=True)

        expected = DataFrame(
            {"a": per_bin},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            expected = expected[expected.a != filler]
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregating one-row groups keeps the value and NaT unchanged."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": values}, index=Index([0, 1], name="a"))
    result = df.groupby("a").aggregate(op)

    tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame with a read-only array matches aggregating a copy."""
    # https://github.com/pandas-dev/pandas/issues/36014
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
        }
    )
    # Mark the first internal array as non-writeable before aggregating.
    df._mgr.arrays[0].flags.writeable = False

    spec = {"sepal_length": agg}
    result = df.groupby(["species"]).agg(spec)
    expected = df.copy().groupby(["species"]).agg(spec)

    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on an ``Int64`` column match the same reductions on float64."""
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    nullable = pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64")
    df = DataFrame({"A": ["A", "B"] * 5, "B": nullable})

    result = getattr(df.groupby("A")["B"], op_name)()

    as_float = df.assign(B=df["B"].astype("float64"))
    expected = getattr(as_float.groupby("A")["B"], op_name)()

    if op_name != "count":
        # the result is not yet consistently using Int64/Float64 dtype,
        # so for now just checking the values by casting to float
        result = result.astype("float64")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of reductions on masked extension arrays.

    Parameters
    ----------
    data : masked extension array used as the aggregated column.
    op_name : name of the groupby reduction to call.
    action : rule used to derive the expected dtype ("always_int",
        "large_int", "always_float" or "preserve").
    with_na : when True, the last element is replaced by ``pd.NA``.

    The same dtype is expected from all four call paths: method and
    ``aggregate`` on the frame groupby, and on the selected column.
    """
    if with_na:
        # ``data`` is created once in the parametrize list and handed to
        # every test case. Work on a copy so that the NA written here does
        # not leak into the ``with_na=False`` cases that run afterwards.
        data = data.copy()
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype

View File

@@ -0,0 +1,216 @@
import numpy as np
import pytest
from pandas.errors import NumbaUtilError
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Index,
NamedAgg,
Series,
option_context,
)
import pandas._testing as tm
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@td.skip_if_no("numba")
def test_correct_function_signature():
    """A one-argument function is rejected by the numba engine."""

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    by_key = data.groupby("key")

    # Same error expected for the frame groupby and the column selection.
    with pytest.raises(NumbaUtilError, match="The first 2"):
        by_key.agg(incorrect_function, engine="numba")

    with pytest.raises(NumbaUtilError, match="The first 2"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba")
@td.skip_if_no("numba")
def test_check_nopython_kwargs():
    """Passing keyword arguments to a numba-engine function raises."""

    def incorrect_function(x, **kwargs):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    by_key = data.groupby("key")

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        by_key.agg(incorrect_function, engine="numba", a=1)

    with pytest.raises(NumbaUtilError, match="numba does not support"):
        data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1)
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython):
    """The numba engine and the cython engine must agree on a scaled mean."""

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    # Series case selects column 1; DataFrame case keeps the frame groupby.
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Switching between two functions keeps both in ``NUMBA_FUNC_CACHE``."""
    # Test that the functions are cached correctly if we switch functions
    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    def run_numba(func):
        return grouped.agg(func, engine="numba", engine_kwargs=engine_kwargs)

    result = run_numba(func_1)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)
    # func_1 should be in the cache now
    assert (func_1, "groupby_agg") in NUMBA_FUNC_CACHE

    # Add func_2 to the cache
    result = run_numba(func_2)
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
    tm.assert_equal(result, expected)
    assert (func_2, "groupby_agg") in NUMBA_FUNC_CACHE

    # Retest func_1 which should use the cache
    result = run_numba(func_1)
    expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
    tm.assert_equal(result, expected)
@td.skip_if_no("numba")
def test_use_global_config():
    """``engine=None`` under ``compute.use_numba`` matches ``engine="numba"``."""

    def func_1(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    explicit = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = grouped.agg(func_1, engine=None)

    tm.assert_frame_equal(explicit, via_option)
@td.skip_if_no("numba")
@pytest.mark.parametrize(
    "agg_func",
    [
        ["min", "max"],
        "min",
        {"B": ["min", "max"], "C": "sum"},
        NamedAgg(column="B", aggfunc="min"),
    ],
)
def test_multifunc_notimplimented(agg_func):
    """Non-callable aggregation specs raise NotImplementedError with numba."""
    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    # Frame groupby first, then the column-1 selection.
    for target in (grouped, grouped[1]):
        with pytest.raises(NotImplementedError, match="Numba engine can"):
            target.agg(agg_func, engine="numba")
@td.skip_if_no("numba")
def test_args_not_cached():
    """A changed positional argument must change the numba result (GH 41647)."""
    # GH 41647
    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # Summing the last n ones gives n for each of the two groups.
    for n, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.agg(sum_last, n, engine="numba")
        expected = Series([total] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
@td.skip_if_no("numba")
def test_index_data_correctly_passed():
    """The ``index`` argument holds each group's index values (GH 43133)."""
    # GH 43133
    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")

    # Group A has index -1 and -2 (mean -1.5); group B has index -3.
    group_index = Index(["A", "B"], name="group")
    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
    tm.assert_frame_equal(result, expected)
@td.skip_if_no("numba")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a MultiIndex frame by one level works with the numba engine."""

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    by_a = df.groupby("A")
    result = by_a.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)

    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
@td.skip_if_no("numba")
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two keys with the numba engine raises NotImplementedError."""

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    by_both = df.groupby(["A", "B"])

    with pytest.raises(NotImplementedError, match="More than 1 grouping labels"):
        by_both.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)

View File

@@ -0,0 +1,673 @@
"""
test all other .agg behavior
"""
import datetime as dt
from functools import partial
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
from pandas.io.formats.printing import pprint_thing
def test_agg_api():
    """``agg(func)`` and ``agg([func])`` agree for a custom function (GH 6337)."""
    # GH 6337
    # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame
    df = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    warning_pattern = r"\['key2'\] did not aggregate successfully"

    # List form: flatten the resulting columns to the two data column names.
    with tm.assert_produces_warning(FutureWarning, match=warning_pattern):
        expected = grouped.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]

    # Bare-callable form.
    with tm.assert_produces_warning(FutureWarning, match=warning_pattern):
        result = grouped.agg(peak_to_peak)

    tm.assert_frame_equal(result, expected)
def test_agg_datetimes_mixed():
    """Grouping by string dates or by ``date`` objects yields equally many groups."""
    rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def as_frame(records):
        # Column order: key, date, value.
        return DataFrame(
            {
                "key": [rec[0] for rec in records],
                "date": [rec[1] for rec in records],
                "value": [rec[2] for rec in records],
            }
        )

    df1 = as_frame(rows)

    # Same rows, with the date strings parsed into datetime.date objects.
    parsed_rows = []
    for key, stamp, value in rows:
        day = dt.datetime.strptime(stamp, "%Y-%m-%d").date() if stamp else None
        parsed_rows.append([key, day, value])
    df2 = as_frame(parsed_rows)

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    # The weights for df2 are computed from df1's values, as in the original.
    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)
def test_agg_period_index():
    """Grouping on a PeriodIndex keeps a PeriodIndex; iterating groups works."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.randn(3, 2), index=monthly)
    summed = df.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    n_rows = len(index)
    s1 = Series(np.random.rand(n_rows), index=index)
    s2 = Series(np.random.rand(n_rows), index=index)
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
    by_month = df.groupby(df.index.month)
    # Materialising the groups only checks that iteration does not raise.
    list(by_month)
def test_agg_dict_parameter_cast_result_dtypes():
    """``first``/``last``/count give the same result through every agg spelling."""
    # GH 12821
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` function, then for `last` function; each expected
    # frame is built from the listed source rows
    for how, source_rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[source_rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    exp = Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count`` for NA-free data."""
    # similar to GH12821
    # xref #11444
    first_of_month = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    labels = list("aaabbbbbbccd")
    df = DataFrame({"X": labels, "Y": first_of_month})

    result = df.groupby("X")["Y"].agg(len)
    expected = df.groupby("X")["Y"].count()

    tm.assert_series_equal(result, expected)
def test_aggregate_float64_no_int64():
    """``mean`` of integer columns is compared against float expectations."""
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
    means = [1, 2.5, 4, 5]
    keys = [1, 2, 4, 5]

    # one selected column, then two selected columns
    for selected in (["a"], ["a", "c"]):
        expected = DataFrame({col: list(means) for col in selected}, index=keys)
        expected.index.name = "b"
        result = df.groupby("b")[selected].mean()
        tm.assert_frame_equal(result, expected)
def test_aggregate_api_consistency():
    """Different ``agg`` spellings must give consistent results (GH 9052).

    List, dict and dict-of-list specifications are compared with frames
    assembled from the individual ``mean``/``sum`` Series. Unknown column
    names in a dict specification must raise ``KeyError``.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This comparison was missing: result and expected were built but the
    # dict-of-lists case was never actually checked.
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
def test_agg_dict_renaming_deprecation():
    """Nested renamers raise SpecificationError; unknown columns raise KeyError."""
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
    nested_msg = r"nested renamer is not supported"

    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with pytest.raises(SpecificationError, match=nested_msg):
        df.groupby("A").agg(nested_spec)

    missing_msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=missing_msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # A renaming dict on a single selected column is also rejected.
    with pytest.raises(SpecificationError, match=nested_msg):
        df.groupby("A").B.agg({"foo": "count"})
def test_agg_compat():
    """Dict specs on a selected column raise SpecificationError (GH 12334)."""
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    g = df.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    rejected_specs = (
        {"C": ["sum", "std"]},
        {"C": "sum", "D": "std"},
    )
    for spec in rejected_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
def test_agg_nested_dicts():
    """Nested-dict aggregation specs raise SpecificationError."""
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    g = df.groupby(["A", "B"])
    msg = r"nested renamer is not supported"

    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    column_specs = (
        {"result1": np.sum, "result2": np.mean},
        {"D": np.sum, "result2": np.mean},
    )
    for spec in column_specs:
        with pytest.raises(SpecificationError, match=msg):
            g["D"].agg(spec)
def test_agg_item_by_item_raise_typeerror():
    """A TypeError raised by the aggregation function reaches the caller."""
    df = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    by_first_column = df.groupby(0)
    with pytest.raises(TypeError, match="test"):
        with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
            by_first_column.agg(raiseException)
def test_series_agg_multikey():
    """``agg(np.sum)`` equals ``sum`` when grouping by year and month."""
    series = tm.makeTimeSeries()
    by_year_month = series.groupby(
        [lambda stamp: stamp.year, lambda stamp: stamp.month]
    )

    result = by_year_month.agg(np.sum)
    expected = by_year_month.sum()

    tm.assert_series_equal(result, expected)
def test_series_agg_multi_pure_python():
    """A Python aggregator that inspects ``values.base`` matches a constant lambda."""
    col_a = [
        "foo", "foo", "foo", "foo",
        "bar", "bar", "bar", "bar",
        "foo", "foo", "foo",
    ]
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    data = DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        # The assertion requires each chunk's values to have a non-empty base.
        assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")

    tm.assert_frame_equal(result, expected)
def test_agg_consistency():
    """``agg(func)`` and ``agg([func])`` give the same values (GH 6715)."""
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    first_day = dt.date(2013, 2, 10)
    second_day = dt.date(2013, 2, 11)
    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [first_day, first_day, second_day, second_day],
        }
    )
    g = df.groupby("date")

    # The list form yields two column levels; keep only the outer one.
    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)
def test_agg_callables():
    """Equivalent summing callables all produce the same result (GH 7929)."""
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    expected = df.groupby("foo").agg(sum)

    # builtin, numpy function, two lambdas, a partial and a callable object
    candidates = (
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    )
    for candidate in candidates:
        result = df.groupby("foo").agg(candidate)
        tm.assert_frame_equal(result, expected)
def test_agg_over_numpy_arrays():
    """Summing a column of numpy arrays adds them element-wise (GH 3788)."""
    # GH 3788
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    df = DataFrame(rows, columns=["category", "arraydata"])
    gb = df.groupby("category")

    expected = DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    """Aggregate tz-aware (or period) values with dtype-changing functions."""
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    # with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    def named(values):
        # Every expected Series is called "b" and indexed by "a".
        out = Series(values, name="b")
        out.index.name = "a"
        return out

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    tm.assert_series_equal(result, named(dti[::2]))

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    tm.assert_series_equal(result, named([2012, 2012]))

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    if as_period:
        one_day = [pd.offsets.Day(1), pd.offsets.Day(1)]
    else:
        one_day = [pd.Timedelta(days=1), pd.Timedelta(days=1)]
    tm.assert_series_equal(result, named(one_day))
def test_agg_timezone_round_trip():
    """tz-aware timestamps come back unchanged from several groupby paths."""
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    stamps = [ts + dt.timedelta(minutes=nn) for nn in range(10)]
    df = DataFrame({"a": 1, "b": stamps})

    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    # First row of the frame is the first element of group "a".
    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

    # Third row of the frame is the last element of group "a".
    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
def test_sum_uint64_overflow():
    """Sum object-dtype integers above the int64 maximum and check the values."""
    # see gh-14758
    # Convert to uint64 and don't overflow
    int64_max = 9223372036854775807
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + int64_max

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
    )
    expected.index.name = 0

    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    numeric_result = df.groupby(0).sum(numeric_only=True)
    no_columns = expected[[]]
    tm.assert_frame_equal(numeric_result, no_columns)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating with ``tuple``/``list`` collects each group's values."""
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    df = DataFrame(columns)

    result = df.groupby(["A", "B"]).aggregate(structure)

    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Series aggregation with ``tuple``/``list`` collects each group's values."""
    # Issue #18079
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    df = DataFrame(columns)

    result = df.groupby("A")["C"].aggregate(structure)

    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
def test_agg_category_nansum(observed):
    """``agg(np.nansum)`` over a categorical key, with and without ``observed``."""
    categories = ["a", "b", "c"]
    keys = pd.Categorical(["a", "a", "b"], categories=categories)
    df = DataFrame({"A": keys, "B": [1, 2, 3]})

    result = df.groupby("A", observed=observed).B.agg(np.nansum)

    full_index = pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A")
    expected = Series([3, 3, 0], index=full_index, name="B")
    if observed:
        # Drop the zero entry, which belongs to the category with no rows.
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)
def test_agg_list_like_func():
    """A lambda returning a list yields one list per group (GH 18473)."""
    # GH 18473
    digits = [str(x) for x in range(3)]
    df = DataFrame({"A": list(digits), "B": list(digits)})

    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})

    expected = DataFrame({"A": list(digits), "B": [[digit] for digit in digits]})
    tm.assert_frame_equal(result, expected)
def test_agg_lambda_with_timezone():
    """A lambda taking ``head(1)`` of tz-aware timestamps keeps the first one."""
    # GH 23683
    first = pd.Timestamp("2018-01-01", tz="UTC")
    second = pd.Timestamp("2018-01-02", tz="UTC")
    df = DataFrame({"tag": [1, 1], "date": [first, second]})

    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    """Aggregate a DecimalArray column with a function that raises on empty input."""
    # make sure we suppress anything other than TypeError or AssertionError
    #  in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    # Rows 0 and 3 are the first rows of the two ``id1`` groups.
    first_per_group = [data[0], data[3]]
    expected = Series(to_decimal(first_per_group))

    def weird_func(x):
        # weird function that raise something other than TypeError or IndexError
        #  in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)

View File

@@ -0,0 +1,191 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.core.groupby.base import (
reduction_kernels,
transformation_kernels,
)
@pytest.fixture(params=[True, False])
def sort(request):
    """Fixture parametrized over ``True`` and ``False``; returns the current value."""
    return request.param
@pytest.fixture(params=[True, False])
def as_index(request):
    """Fixture parametrized over ``True`` and ``False``; returns the current value."""
    return request.param
@pytest.fixture
def mframe(multiindex_dataframe_random_data):
    """Expose the ``multiindex_dataframe_random_data`` fixture under a shorter name."""
    return multiindex_dataframe_random_data
@pytest.fixture
def df():
    """Eight-row frame: string columns "A"/"B" and random-normal columns "C"/"D"."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )
@pytest.fixture
def ts():
    """Return the object built by ``tm.makeTimeSeries()``."""
    return tm.makeTimeSeries()
@pytest.fixture
def tsd():
    """Return the object built by ``tm.getTimeSeriesData()``."""
    return tm.getTimeSeriesData()
@pytest.fixture
def tsframe(tsd):
    """Wrap the ``tsd`` fixture's data in a ``DataFrame``."""
    return DataFrame(tsd)
@pytest.fixture
def df_mixed_floats():
    """Eight-row frame like ``df`` but with column "D" stored as float32."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            # same kind of random data as "C", explicitly cast to float32
            "D": np.array(np.random.randn(8), dtype="float32"),
        }
    )
@pytest.fixture
def three_group():
    """Eleven-row frame with three string columns ("A", "B", "C") and three
    random-normal columns ("D", "E", "F")."""
    return DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )
@pytest.fixture()
def slice_test_df():
    """Eight-row frame with "Group" and "Value" columns, indexed by "Index"."""
    rows = [
        [0, "a", "a0_at_0"],
        [1, "b", "b0_at_1"],
        [2, "a", "a1_at_2"],
        [3, "b", "b1_at_3"],
        [4, "c", "c0_at_4"],
        [5, "a", "a2_at_5"],
        [6, "a", "a3_at_6"],
        [7, "a", "a4_at_7"],
    ]
    frame = DataFrame(rows, columns=["Index", "Group", "Value"])
    indexed = frame.set_index("Index")
    return indexed
@pytest.fixture()
def slice_test_grouped(slice_test_df):
    """Group the ``slice_test_df`` fixture by "Group" with ``as_index=False``."""
    return slice_test_df.groupby("Group", as_index=False)
@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
    """
    yields the string names of all groupby reduction functions, one at a time.

    The names are the sorted members of ``reduction_kernels``.
    """
    return request.param
@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
    """yields the string names of all groupby transformation functions.

    The names are the sorted members of ``transformation_kernels``.
    """
    return request.param
@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
    """yields both aggregation and transformation functions.

    Sorted reduction names come first, followed by sorted transformation names.
    """
    return request.param
@pytest.fixture(params=[True, False])
def parallel(request):
    """parallel keyword argument for numba.jit (both ``True`` and ``False``)"""
    return request.param
# Can parameterize nogil & nopython over True | False, but limiting per
# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472
@pytest.fixture(params=[False])
def nogil(request):
    """nogil keyword argument for numba.jit (currently only ``False``)"""
    return request.param
@pytest.fixture(params=[True])
def nopython(request):
    """nopython keyword argument for numba.jit (currently only ``True``)"""
    return request.param
@pytest.fixture(
    params=[
        ("mean", {}),
        ("var", {"ddof": 1}),
        ("var", {"ddof": 0}),
        ("std", {"ddof": 1}),
        ("std", {"ddof": 0}),
        ("sum", {}),
    ]
)
def numba_supported_reductions(request):
    """reductions supported with engine='numba'

    Each param is a ``(method_name, kwargs_dict)`` tuple.
    """
    return request.param

View File

@@ -0,0 +1,450 @@
"""
test methods relating to generic function evaluation:
the so-called allowlists and blocklists
"""
from string import ascii_lowercase
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
from pandas.core.groupby.base import (
groupby_other_methods,
reduction_kernels,
transformation_kernels,
)
# Method names that test_regression_allowlist_methods is parametrized over.
AGG_FUNCTIONS = [
    "sum",
    "prod",
    "min",
    "max",
    "median",
    "mean",
    "skew",
    "mad",
    "std",
    "var",
    "sem",
]
# Subset of AGG_FUNCTIONS that the same test calls with an explicit ``skipna``.
AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"]
# Names compared against ``_apply_allowlist`` of a DataFrame groupby in
# test_groupby_allowlist; also the params of df_allowlist_fixture.
df_allowlist = [
    "quantile",
    "fillna",
    "mad",
    "take",
    "idxmax",
    "idxmin",
    "tshift",
    "skew",
    "plot",
    "hist",
    "dtypes",
    "corrwith",
    "corr",
    "cov",
    "diff",
]
@pytest.fixture(params=df_allowlist)
def df_allowlist_fixture(request):
    """Return one name from ``df_allowlist`` per test instance."""
    return request.param
# Names compared against ``_apply_allowlist`` of a Series groupby in
# test_groupby_allowlist; also the params of s_allowlist_fixture.
s_allowlist = [
    "quantile",
    "fillna",
    "mad",
    "take",
    "idxmax",
    "idxmin",
    "tshift",
    "skew",
    "plot",
    "hist",
    "dtype",
    "corr",
    "cov",
    "diff",
    "unique",
    "nlargest",
    "nsmallest",
    "is_monotonic_increasing",
    "is_monotonic_decreasing",
]
@pytest.fixture(params=s_allowlist)
def s_allowlist_fixture(request):
    """Return one name from ``s_allowlist`` per test instance."""
    return request.param
@pytest.fixture
def df():
    """Eight-row frame: string columns "A"/"B" and random-normal columns "C"/"D"."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )
@pytest.fixture
def df_letters():
    """Ten-row frame with random "floats" and randomly drawn lowercase "letters"."""
    N = 10
    alphabet = np.array(list(ascii_lowercase))
    # N random positions into the 26-letter alphabet
    picks = np.random.randint(0, 26, N)
    random_letters = alphabet.take(picks)
    return DataFrame(
        {
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )
@pytest.mark.parametrize("allowlist", [df_allowlist, s_allowlist])
def test_groupby_allowlist(df_letters, allowlist):
    """The groupby's ``_apply_allowlist`` holds exactly the names in ``allowlist``."""
    # df_allowlist -> group the frame itself; otherwise group its "floats" column
    obj = df_letters if allowlist == df_allowlist else df_letters["floats"]

    gb = obj.groupby(df_letters.letters)
    assert set(allowlist) == set(gb._apply_allowlist)
def check_allowlist(obj, df, m):
    """Check the name metadata of attribute ``m`` on the groupby type of ``obj``.

    ``obj`` is grouped by ``df.letters`` and ``m`` is looked up on the type of
    the resulting groupby. If that attribute has a ``__name__`` it must equal
    ``m``; if it also has a ``__qualname__`` it must end with ``m``. The check
    stops silently at the first of the two that is missing.
    """
    attr = getattr(type(obj.groupby(df.letters)), m)

    # name
    if not hasattr(attr, "__name__"):
        return
    assert attr.__name__ == m

    # qualname
    if not hasattr(attr, "__qualname__"):
        return
    assert attr.__qualname__.endswith(m)
def test_groupby_series_allowlist(df_letters, s_allowlist_fixture):
    """Run ``check_allowlist`` for one Series allowlist name on the "letters" column."""
    check_allowlist(df_letters.letters, df_letters, s_allowlist_fixture)
def test_groupby_frame_allowlist(df_letters, df_allowlist_fixture):
    """Run ``check_allowlist`` for one DataFrame allowlist name on the whole frame."""
    check_allowlist(df_letters, df_letters, df_allowlist_fixture)
@pytest.fixture
def raw_frame(multiindex_dataframe_random_data):
    """``multiindex_dataframe_random_data`` with a few cells overwritten by NaN."""
    df = multiindex_dataframe_random_data
    # row 1: columns 1 and 2; row 7: columns 0 and 1 (positional)
    df.iloc[1, [1, 2]] = np.nan
    df.iloc[7, [0, 1]] = np.nan
    return df
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort):
    """Compare ``groupby(level=...).<op>()`` with ``frame.<op>(level=...)``."""
    # GH6944
    # GH 17537
    # explicitly test the allowlist methods
    frame = raw_frame if axis == 0 else raw_frame.T

    # only the ops listed in AGG_FUNCTIONS_WITH_SKIPNA are called with skipna
    extra = {"skipna": skipna} if op in AGG_FUNCTIONS_WITH_SKIPNA else {}

    grouped = frame.groupby(level=level, axis=axis, sort=sort)
    result = getattr(grouped, op)(**extra)

    # the frame-level call with ``level=`` is expected to emit a FutureWarning
    with tm.assert_produces_warning(FutureWarning):
        expected = getattr(frame, op)(level=level, axis=axis, **extra)
    if sort:
        expected = expected.sort_index(axis=axis, level=level)
    tm.assert_frame_equal(result, expected)
def test_groupby_blocklist(df_letters):
    """Accessing a blocked attribute on a groupby raises ``AttributeError``.

    The blocked names are a fixed list plus every ``to_*`` attribute of the
    frame; each is tried on both a DataFrame and a Series groupby.
    """
    df = df_letters
    s = df_letters.floats

    blocklist = [
        "eval",
        "query",
        "abs",
        "where",
        "mask",
        "align",
        "groupby",
        "clip",
        "astype",
        "at",
        "combine",
        "consolidate",
        "convert_objects",
    ]
    blocklist.extend(name for name in dir(df) if name.startswith("to_"))

    for bl in blocklist:
        quoted = repr(bl)
        for obj in (df, s):
            gb = obj.groupby(df.letters)
            gb_type = type(gb).__name__

            # e.g., to_csv
            defined_but_not_allowed = (
                f"(?:^Cannot.+{quoted}.+'{gb_type}'.+try "
                f"using the 'apply' method$)"
            )
            # e.g., query, eval
            not_defined = f"(?:^'{gb_type}' object has no attribute {quoted}$)"

            # either of the two messages is accepted
            msg = f"{defined_but_not_allowed}|{not_defined}"
            with pytest.raises(AttributeError, match=msg):
                getattr(gb, bl)
def test_tab_completion(mframe):
    """The public names in ``dir()`` of a groupby match an exact expected set."""
    grp = mframe.groupby(level="second")
    # only names without a leading underscore are compared
    results = {v for v in dir(grp) if not v.startswith("_")}
    expected = {
        "A",
        "B",
        "C",
        "agg",
        "aggregate",
        "apply",
        "boxplot",
        "filter",
        "first",
        "get_group",
        "groups",
        "hist",
        "indices",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "ngroups",
        "nth",
        "ohlc",
        "plot",
        "prod",
        "size",
        "std",
        "sum",
        "transform",
        "var",
        "sem",
        "count",
        "nunique",
        "head",
        "describe",
        "cummax",
        "quantile",
        "rank",
        "cumprod",
        "tail",
        "resample",
        "cummin",
        "fillna",
        "cumsum",
        "cumcount",
        "ngroup",
        "all",
        "shift",
        "skew",
        "take",
        "tshift",
        "pct_change",
        "any",
        "mad",
        "corr",
        "corrwith",
        "cov",
        "dtypes",
        "ndim",
        "diff",
        "idxmax",
        "idxmin",
        "ffill",
        "bfill",
        "pad",
        "backfill",
        "rolling",
        "expanding",
        "pipe",
        "sample",
        "ewm",
        "value_counts",
    }
    assert results == expected
def test_groupby_function_rename(mframe):
    """Each listed groupby method reports its own name via ``__name__``."""
    grp = mframe.groupby(level="second")
    for name in ("sum", "prod", "min", "max", "first", "last"):
        method = getattr(grp, name)
        assert method.__name__ == name
@pytest.mark.parametrize(
    "method",
    [
        "count",
        "corr",
        "cummax",
        "cummin",
        "cumprod",
        "describe",
        "rank",
        "quantile",
        "diff",
        "shift",
        "all",
        "any",
        "idxmin",
        "idxmax",
        "ffill",
        "bfill",
        "pct_change",
    ],
)
def test_groupby_selection_with_methods(df, method):
    """Selecting ``[["C"]]`` after grouping matches grouping ``df[["C"]]`` directly."""
    # some methods which require DatetimeIndex
    df.index = date_range("2014", periods=len(df))

    selected = df.groupby(["A"])[["C"]]
    reference = df[["C"]].groupby(df["A"])
    # TODO check groupby with > 1 col ?

    res = getattr(selected, method)()
    exp = getattr(reference, method)()

    # should always be frames!
    tm.assert_frame_equal(res, exp)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_groupby_selection_tshift_raises(df):
    """``tshift`` on the selected groupby raises the "Freq was not set" error."""
    df.index = date_range("2014", periods=len(df))
    selected = df.groupby(["A"])[["C"]]

    # check that the index cache is cleared
    expected_msg = "Freq was not set in the index"
    with pytest.raises(ValueError, match=expected_msg):
        # GH#35937
        selected.tshift()
def test_groupby_selection_other_methods(df):
    """Selected-column groupby agrees with the reference for non-``.foo()`` calls."""
    # some methods which require DatetimeIndex
    stamps = date_range("2014", periods=len(df))
    df.columns.name = "foo"
    df.index = stamps

    g = df.groupby(["A"])[["C"]]
    g_exp = df[["C"]].groupby(df["A"])

    # methods which aren't just .foo()
    tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
    tm.assert_frame_equal(g.dtypes, g_exp.dtypes)

    applied = g.apply(lambda x: x.sum())
    applied_exp = g_exp.apply(lambda x: x.sum())
    tm.assert_frame_equal(applied, applied_exp)

    tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
    tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())

    filtered = g.filter(lambda x: len(x) == 3)
    filtered_exp = g_exp.filter(lambda x: len(x) == 3)
    tm.assert_frame_equal(filtered, filtered_exp)
def test_all_methods_categorized(mframe):
    """Every public groupby attribute belongs to exactly one kernel list.

    The public names on a groupby (excluding the frame's column names) must be
    covered by ``reduction_kernels``, ``transformation_kernels`` and
    ``groupby_other_methods``; those three sets must be pairwise disjoint and
    must not contain names missing from the groupby.
    """
    grp = mframe.groupby(mframe.iloc[:, 0])
    names = {name for name in dir(grp) if not name.startswith("_")} - set(
        mframe.columns
    )
    new_names = set(names)
    new_names -= reduction_kernels
    new_names -= transformation_kernels
    new_names -= groupby_other_methods

    # the three lists must not overlap
    assert not (reduction_kernels & transformation_kernels)
    assert not (reduction_kernels & groupby_other_methods)
    assert not (transformation_kernels & groupby_other_methods)

    # new public method?
    if new_names:
        msg = f"""
There are uncategorized methods defined on the Grouper class:
{new_names}.
Was a new method recently added?
Every public method on Grouper must appear in exactly one of the
following three lists defined in pandas.core.groupby.base:
- `reduction_kernels`
- `transformation_kernels`
- `groupby_other_methods`
see the comments in pandas/core/groupby/base.py for guidance on
how to fix this test.
"""
        raise AssertionError(msg)

    # removed a public method?
    all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
    if names != all_categorized:
        msg = f"""
Some methods which are supposed to be on the Grouper class
are missing:
{all_categorized - names}.
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
If you removed a method, you should update them
"""
        raise AssertionError(msg)

View File

@@ -0,0 +1,190 @@
import builtins
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["foo", "bar", "baz"],
        ["foo", "", ""],
        ["", "", ""],
        [1, 2, 3],
        [1, 0, 0],
        [0, 0, 0],
        [1.0, 2.0, 3.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0],
        [True, True, True],
        [True, False, False],
        [False, False, False],
        [np.nan, np.nan, np.nan],
    ],
)
def test_groupby_bool_aggs(agg_func, skipna, vals):
    """Groupby ``any``/``all`` agrees with the Python builtin of the same name."""
    # both groups ("a" and "b") receive the same three values
    frame = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})

    # Figure out expectation using Python builtin
    builtin_agg = getattr(builtins, agg_func)
    exp = builtin_agg(vals)

    # edge case for missing data with skipna and 'any'
    if skipna and all(isna(vals)) and agg_func == "any":
        exp = False

    expected_index = Index(["a", "b"], name="key")
    exp_df = DataFrame([exp] * 2, columns=["val"], index=expected_index)

    grouped_agg = getattr(frame.groupby("key"), agg_func)
    result = grouped_agg(skipna=skipna)
    tm.assert_frame_equal(result, exp_df)
def test_any():
    """``groupby("A").any()`` on a frame with a NaN-containing and a string column."""
    rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]]
    frame = DataFrame(rows, columns=["A", "B", "C"])

    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"

    result = frame.groupby("A").any()
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_bool_aggs_dup_column_labels(bool_agg_func):
    """``any``/``all`` on a one-row frame with duplicate column labels returns the frame."""
    # 21668
    frame = DataFrame([[True, True]], columns=["a", "a"])
    grouped = frame.groupby([0])

    result = getattr(grouped, bool_agg_func)()

    # the single all-True row is its own expected result
    tm.assert_frame_equal(result, frame)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
    "data",
    [
        [False, False, False],
        [True, True, True],
        [pd.NA, pd.NA, pd.NA],
        [False, pd.NA, False],
        [True, pd.NA, True],
        [True, pd.NA, False],
    ],
)
def test_masked_kleene_logic(bool_agg_func, skipna, data):
    """Single-group ``any``/``all`` on a "boolean" Series matches the Series method."""
    # GH#37506
    ser = Series(data, dtype="boolean")

    # The result should match aggregating on the whole series. Correctness
    # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
    whole_series_agg = getattr(ser, bool_agg_func)
    expected = Series(whole_series_agg(skipna=skipna), dtype="boolean")

    # all three rows fall in the same group (key 0)
    single_group = ser.groupby([0, 0, 0])
    result = single_group.agg(bool_agg_func, skipna=skipna)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "dtype1,dtype2,exp_col1,exp_col2",
    [
        (
            "float",
            "Float64",
            np.array([True], dtype=bool),
            pd.array([pd.NA], dtype="boolean"),
        ),
        (
            "Int64",
            "float",
            pd.array([pd.NA], dtype="boolean"),
            np.array([True], dtype=bool),
        ),
        (
            "Int64",
            "Int64",
            pd.array([pd.NA], dtype="boolean"),
            pd.array([pd.NA], dtype="boolean"),
        ),
        (
            "Float64",
            "boolean",
            pd.array([pd.NA], dtype="boolean"),
            pd.array([pd.NA], dtype="boolean"),
        ),
    ],
)
def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
    """``all`` with ``skipna=False`` on two columns of differing dtypes."""
    # GH#37506
    data = [1.0, np.nan]
    col1 = pd.array(data, dtype=dtype1)
    col2 = pd.array(data, dtype=dtype2)
    frame = DataFrame({"col1": col1, "col2": col2})

    # both rows share the group key 1
    result = frame.groupby([1, 1]).agg("all", skipna=False)

    expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("skipna", [True, False])
def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
    """``any``/``all`` on ``[pd.NA, 1]`` in one group, with and without ``skipna``."""
    # GH#40585
    obj = frame_or_series([pd.NA, 1], dtype=dtype)

    # only "all" with skipna=False is expected to give NA; otherwise True
    na_expected = bool_agg_func == "all" and not skipna
    expected_res = pd.NA if na_expected else True
    expected = frame_or_series([expected_res], index=[1], dtype="boolean")

    result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "bool_agg_func,data,expected_res",
    [
        ("any", [pd.NA, np.nan], False),
        ("any", [pd.NA, 1, np.nan], True),
        ("all", [pd.NA, pd.NaT], True),
        ("all", [pd.NA, False, pd.NaT], False),
    ],
)
def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series):
    """``any``/``all`` on object-dtype data containing missing values."""
    # GH#37501
    obj = frame_or_series(data, dtype=object)

    # one group containing every row
    group_keys = [1] * len(data)
    result = obj.groupby(group_keys).agg(bool_agg_func)

    expected = frame_or_series([expected_res], index=[1], dtype="bool")
    tm.assert_equal(result, expected)
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_object_NA_raises_with_skipna_false(bool_agg_func):
    """Object-dtype ``pd.NA`` with ``skipna=False`` raises ``TypeError``."""
    # GH#37501
    ser = Series([pd.NA], dtype=object)
    grouped = ser.groupby([1])
    expected_msg = "boolean value of NA is ambiguous"
    with pytest.raises(TypeError, match=expected_msg):
        grouped.agg(bool_agg_func, skipna=False)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_empty(frame_or_series, bool_agg_func):
    """``any``/``all`` on an empty object-dtype input gives an empty bool result."""
    # GH 45231
    if frame_or_series is DataFrame:
        kwargs = {"columns": ["a"]}
    else:
        kwargs = {"name": "a"}

    obj = frame_or_series(**kwargs, dtype=object)
    grouped = obj.groupby(obj.index)
    result = getattr(grouped, bool_agg_func)()

    expected = frame_or_series(**kwargs, dtype=bool)
    tm.assert_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
import pandas._testing as tm
def test_mutate_groups():
    """``apply`` gives the same result whether or not the function copies its group."""
    # GH3380
    # each letter appears twice in a row: c c d d e e f f c c d d e e
    cat2 = [letter for letter in "cdefcde" for _ in range(2)]
    df = pd.DataFrame(
        {
            "cat1": ["a"] * 8 + ["b"] * 6,
            "cat2": cat2,
            "cat3": [f"g{x}" for x in range(1, 15)],
            "val": np.random.randint(100, size=14),
        }
    )

    def f_copy(x):
        # works on a copy, leaving the group passed in untouched
        x = x.copy()
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    def f_no_copy(x):
        # adds the "rank" column directly to the group passed in
        x["rank"] = x.val.rank(method="min")
        return x.groupby("cat2")["rank"].min()

    by_cat1_copy = df.groupby("cat1").apply(f_copy)
    by_cat1_no_copy = df.groupby("cat1").apply(f_no_copy)
    tm.assert_series_equal(by_cat1_copy, by_cat1_no_copy)
def test_no_mutate_but_looks_like():
    """``x[:].key`` and ``x.key`` give the same result under ``apply``."""
    # GH 8467
    # first shows mutation indicator
    # second does not, but should yield the same results
    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    via_slice = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
    direct = df.groupby("key", group_keys=True).apply(lambda x: x.key)
    tm.assert_series_equal(via_slice, direct)
def test_apply_function_with_indexing():
    """``apply`` with a function that overwrites the last "col2" value of each group."""
    # GH: 33058
    df = pd.DataFrame(
        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
    )

    def fn(x):
        # set the value at the group's last index label to 0, then return the column
        x.col2[x.index[-1]] = 0
        return x.col2

    result = df.groupby(["col1"], as_index=False).apply(fn)
    # expected: the last row of each group (values 3 and 6) replaced by 0
    expected = pd.Series(
        [1, 2, 0, 4, 5, 0],
        index=pd.MultiIndex.from_tuples(
            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
        ),
        name="col2",
    )
    tm.assert_series_equal(result, expected)
def test_apply_mutate_columns_multiindex():
    """Column-wise ``apply`` with a function that adds a "sum" column to each group."""
    # GH 12652
    df = pd.DataFrame(
        {
            ("C", "julian"): [1, 2, 3],
            ("B", "geoffrey"): [1, 2, 3],
            ("A", "julian"): [1, 2, 3],
            ("B", "julian"): [1, 2, 3],
            ("A", "geoffrey"): [1, 2, 3],
            ("C", "geoffrey"): [1, 2, 3],
        },
        columns=pd.MultiIndex.from_tuples(
            [
                ("A", "julian"),
                ("A", "geoffrey"),
                ("B", "julian"),
                ("B", "geoffrey"),
                ("C", "julian"),
                ("C", "geoffrey"),
            ]
        ),
    )

    def add_column(grouped):
        # second level of the first column label ("julian" or "geoffrey")
        name = grouped.columns[0][1]
        # row-wise sum of the group, stored as a new ("sum", name) column
        grouped["sum", name] = grouped.sum(axis=1)
        return grouped

    # group the columns by their second level
    result = df.groupby(level=1, axis=1).apply(add_column)
    expected = pd.DataFrame(
        [
            [1, 1, 1, 3, 1, 1, 1, 3],
            [2, 2, 2, 6, 2, 2, 2, 6],
            [
                3,
                3,
                3,
                9,
                3,
                3,
                3,
                9,
            ],
        ],
        columns=pd.MultiIndex.from_tuples(
            [
                ("geoffrey", "A", "geoffrey"),
                ("geoffrey", "B", "geoffrey"),
                ("geoffrey", "C", "geoffrey"),
                ("geoffrey", "sum", "geoffrey"),
                ("julian", "A", "julian"),
                ("julian", "B", "julian"),
                ("julian", "C", "julian"),
                ("julian", "sum", "julian"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,69 @@
import numpy as np
import pytest
from pandas._libs import lib
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def assert_block_lengths(x):
    """Assert ``len(x)`` equals the length of its first block's ``mgr_locs``; return 0."""
    assert len(x) == len(x._mgr.blocks[0].mgr_locs)
    return 0
def cumsum_max(x):
    """Compute ``x.cumsum().max()`` and discard it; always return 0."""
    # the value is not used: only the computation itself is exercised
    x.cumsum().max()
    return 0
@pytest.mark.parametrize(
    "func",
    [
        cumsum_max,
        pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test),
    ],
)
def test_mgr_locs_updated(func):
    """Aggregating with ``func`` (which always returns 0) gives a frame of zeros."""
    # https://github.com/pandas-dev/pandas/issues/31802
    # Some operations may require creating new blocks, which requires
    #  valid mgr_locs
    frame = pd.DataFrame(
        {"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}
    )
    result = frame.groupby(["A", "B"]).agg(func)

    expected_index = pd.MultiIndex.from_product(
        [["a"], ["a", "b"]], names=["A", "B"]
    )
    expected = pd.DataFrame({"C": [0, 0]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "binner,closed,expected",
    [
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "left",
            np.array([2, 5, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6, 9], dtype=np.int64),
            "right",
            np.array([3, 6, 6], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "left",
            np.array([2, 5], dtype=np.int64),
        ),
        (
            np.array([0, 3, 6], dtype=np.int64),
            "right",
            np.array([3, 6], dtype=np.int64),
        ),
    ],
)
def test_generate_bins(binner, closed, expected):
    """``lib.generate_bins_dt64`` on the values 1..6 returns ``expected``."""
    # int64 values 1, 2, 3, 4, 5, 6
    values = np.arange(1, 7, dtype=np.int64)
    result = lib.generate_bins_dt64(values, binner, closed=closed)
    tm.assert_numpy_array_equal(result, expected)
class TestMoments:
    # Empty placeholder class: it defines no tests.
    pass

View File

@@ -0,0 +1,378 @@
from itertools import product
from string import ascii_lowercase
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Period,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCounting:
    """Tests for groupby ``cumcount``, ``ngroup`` and ``count``."""

    def test_cumcount(self):
        """``cumcount`` numbers rows within each group, for frame and column groupby."""
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_empty(self):
        """``cumcount`` on empty objects returns an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.cumcount())
        tm.assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        """``cumcount`` keeps a duplicated index on the result."""
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        """``cumcount`` keeps a MultiIndex on the result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        """``cumcount`` when grouping by a list of keys rather than a column."""
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        tm.assert_series_equal(expected, g.cumcount())
        tm.assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        """``ngroup`` labels each row with its group number."""
        df = DataFrame({"A": list("aaaba")})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        """``ngroup`` with five distinct keys gives 0..4."""
        df = DataFrame({"A": list("abcde")})
        g = df.groupby("A")
        sg = g.A

        expected = Series(range(5), dtype="int64")

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        """``ngroup`` with a single key gives all zeros."""
        df = DataFrame({"A": [0] * 5})
        g = df.groupby("A")
        sg = g.A

        expected = Series([0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        """``ngroup`` on empty objects returns an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        se = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype="int64")

        tm.assert_series_equal(e, ge.ngroup())
        tm.assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        """Frame and Series grouped by the same keys give the same ``ngroup``."""
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))

        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        """``ngroup`` keeps a duplicated index on the result."""
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby("A")
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        """``ngroup`` keeps a MultiIndex on the result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({"A": list("aaaba")}, index=mi)
        g = df.groupby("A")
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        """``ngroup`` when grouping by a list of keys rather than a column."""
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        tm.assert_series_equal(expected, g.ngroup())
        tm.assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        """``ngroup(ascending=False)`` equals ``(ngroups - 1) - ascending``."""
        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        g = df.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
        tm.assert_series_equal(ascending, g.ngroup(ascending=True))
        tm.assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        """``ngroup`` and ``cumcount`` on one hand-computed two-key example."""
        # verify one manually-worked out case works
        df = DataFrame(
            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
            columns=["A", "X"],
        )
        g = df.groupby(["A", "X"])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        tm.assert_series_equal(g_ngroup, expected_ngroup)
        tm.assert_series_equal(g_cumcount, expected_cumcount)

    def test_ngroup_cumcount_pair(self):
        """``ngroup``/``cumcount`` against pure-Python results for all 3**4 inputs."""
        # brute force comparison for all small series
        for p in product(range(3), repeat=4):
            df = DataFrame({"a": p})
            g = df.groupby(["a"])

            order = sorted(set(p))
            # rank of each value among the distinct sorted values
            ngroupd = [order.index(val) for val in p]
            # number of earlier occurrences of each value
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            tm.assert_series_equal(g.ngroup(), Series(ngroupd))
            tm.assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self):
        """``ngroup``/``cumcount`` follow the iteration order of the groupby."""
        np.random.seed(0)
        df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
        for sort_flag in (False, True):
            g = df.groupby(["a"], sort=sort_flag)
            df["group_id"] = -1
            df["group_index"] = -1

            # record, per row, the position of its group and its position
            # within that group, as seen while iterating the groupby
            for i, (_, group) in enumerate(g):
                df.loc[group.index, "group_id"] = i
                for j, ind in enumerate(group.index):
                    df.loc[ind, "group_index"] = j

            tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
            tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        """``count`` works on a column of datetime-like values."""
        # test for #13393, where DataFrameGroupBy.count() fails
        # when counting a datetimelike column.

        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = df.groupby("x").count()
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"
        tm.assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        """``count`` is empty when the first grouping key is all NaN."""
        # GH21956
        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = df.groupby(["A", "B"]).C.count()
        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
        expected = Series([], index=mi, dtype=np.int64, name="C")
        tm.assert_series_equal(result, expected, check_index_type=False)

    def test_count_groupby_column_with_nan_in_groupby_column(self):
        """``count`` leaves out the row whose grouping key is NaN."""
        # https://github.com/pandas-dev/pandas/issues/32841
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
        res = df.groupby(["B"]).count()
        expected = DataFrame(
            index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
        )
        tm.assert_frame_equal(expected, res)

    def test_groupby_count_dateparseerror(self):
        """``count`` is the same whichever index level holds the datetimes."""
        dr = date_range(start="1/1/2012", freq="5min", periods=10)

        # BAD Example, datetimes first
        ser = Series(np.arange(10), index=[dr, np.arange(10)])
        grouped = ser.groupby(lambda x: x[1] % 2 == 0)
        result = grouped.count()

        ser = Series(np.arange(10), index=[np.arange(10), dr])
        grouped = ser.groupby(lambda x: x[0] % 2 == 0)
        expected = grouped.count()

        tm.assert_series_equal(result, expected)
def test_groupby_timedelta_cython_count():
    """``count`` on a timedelta64[ns] column grouped into two equal groups."""
    deltas = np.arange(4).astype("timedelta64[ns]")
    frame = DataFrame({"g": list("ab" * 2), "delt": deltas})

    result = frame.groupby("g").delt.count()

    expected_index = Index(["a", "b"], name="g")
    expected = Series([2, 2], index=expected_index, name="delt")
    tm.assert_series_equal(expected, result)
def test_count():
    """Groupby ``count`` matches applying ``DataFrame.count`` to each group.

    Builds a 2**15-row frame of mixed dtypes (strings, ints, floats,
    datetimes, timedeltas, categorical), overwrites random rows of most
    columns with NaN, and compares the two approaches for one and two
    grouping keys.
    """
    n = 1 << 15
    # "min" is the minute alias; the single-letter "T" spelling is deprecated
    dr = date_range("2015-08-30", periods=n // 10, freq="min")

    df = DataFrame(
        {
            "1st": np.random.choice(list(ascii_lowercase), n),
            "2nd": np.random.randint(0, 5, n),
            "3rd": np.random.randn(n).round(3),
            "4th": np.random.randint(-10, 10, n),
            "5th": np.random.choice(dr, n),
            "6th": np.random.randn(n).round(3),
            "7th": np.random.randn(n).round(3),
            "8th": np.random.choice(dr, n) - np.random.choice(dr, 1),
            "9th": np.random.choice(list(ascii_lowercase), n),
        }
    )

    # put NaN into every column except the grouping keys and "4th"
    for col in df.columns.drop(["1st", "2nd", "4th"]):
        df.loc[np.random.choice(n, n // 10), col] = np.nan

    df["9th"] = df["9th"].astype("category")

    for key in ["1st", "2nd", ["1st", "2nd"]]:
        left = df.groupby(key).count()
        # the key columns are dropped from the apply result before comparing
        right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
        tm.assert_frame_equal(left, right)
def test_count_non_nulls():
    """``count`` counts only non-null values, with and without ``as_index``."""
    # GH#5610
    # count counts non-nulls
    rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]]
    df = DataFrame(rows, columns=["A", "B", "C"])

    expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
    expected.index.name = "A"

    count_as = df.groupby("A").count()
    count_not_as = df.groupby("A", as_index=False).count()
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    # selecting a single column gives the matching Series
    count_B = df.groupby("A")["B"].count()
    tm.assert_series_equal(count_B, expected["B"])
def test_count_object():
    """``count`` on an object column, first without and then with NaN values."""
    group_keys = [2] * 3 + [3] * 3
    expected_index = Index([2, 3], name="c")

    # no missing values: three per group
    df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": group_keys})
    result = df.groupby("c").a.count()
    expected = Series([3, 3], index=expected_index, name="a")
    tm.assert_series_equal(result, expected)

    # two NaN values in the first group
    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": group_keys})
    result = df.groupby("c").a.count()
    expected = Series([1, 3], index=expected_index, name="a")
    tm.assert_series_equal(result, expected)
def test_count_cross_type():
    """``count`` is unchanged when value columns are cast to float32 or object."""
    # GH8169
    left_half = np.random.randint(0, 5, (100, 2))
    right_half = np.random.randint(0, 2, (100, 2))
    vals = np.hstack((left_half, right_half))

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    # casts accumulate: "a" and "b" are re-cast on each pass
    for t in ("float32", "object"):
        for col in ("a", "b"):
            df[col] = df[col].astype(t)
        result = df.groupby(["c", "d"]).count()
        tm.assert_frame_equal(result, expected)
def test_lower_int_prec_count():
    """``count`` on int8, uint32 and int16 columns."""
    columns = {
        "a": np.array([0, 1, 2, 100], np.int8),
        "b": np.array([1, 2, 3, 6], np.uint32),
        "c": np.array([4, 5, 6, 8], np.int16),
        "grp": list("ab" * 2),
    }
    df = DataFrame(columns)

    result = df.groupby("grp").count()

    expected_index = Index(list("ab"), name="grp")
    expected = DataFrame(
        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=expected_index
    )
    tm.assert_frame_equal(result, expected)
def test_count_uses_size_on_exception():
    """``count`` on a column of objects whose ``__eq__`` always raises."""

    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython"):
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    raising_values = [RaisingObject() for _ in range(4)]
    df = DataFrame({"a": raising_values, "grp": list("ab" * 2)})

    result = df.groupby("grp").count()

    expected_index = Index(list("ab"), name="grp")
    expected = DataFrame({"a": [2, 2]}, index=expected_index)
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,614 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
)
import pandas._testing as tm
def test_filter_series():
    """Check ``SeriesGroupBy.filter`` with the default and ``dropna=False``."""
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    grouped = ser.groupby(ser.apply(lambda x: x % 2))

    # (predicate, rows expected to survive it)
    cases = [
        (lambda x: x.mean() < 10, Series([1, 3, 5, 7], index=[0, 1, 3, 6])),
        (lambda x: x.mean() > 10, Series([20, 22, 24], index=[2, 4, 5])),
    ]

    for keep, survivors in cases:
        tm.assert_series_equal(grouped.filter(keep), survivors)

    # Test dropna=False.
    for keep, survivors in cases:
        tm.assert_series_equal(
            grouped.filter(keep, dropna=False),
            survivors.reindex(ser.index),
        )
def test_filter_single_column_df():
    """Check ``DataFrameGroupBy.filter`` on a one-column frame."""
    frame = DataFrame([1, 3, 20, 5, 22, 24, 7])
    grouped = frame.groupby(frame[0].apply(lambda x: x % 2))

    # (predicate, rows expected to survive it)
    cases = [
        (lambda x: x.mean() < 10, DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])),
        (lambda x: x.mean() > 10, DataFrame([20, 22, 24], index=[2, 4, 5])),
    ]

    for keep, survivors in cases:
        tm.assert_frame_equal(grouped.filter(keep), survivors)

    # Test dropna=False.
    for keep, survivors in cases:
        tm.assert_frame_equal(
            grouped.filter(keep, dropna=False),
            survivors.reindex(frame.index),
        )
def test_filter_multi_column_df():
    """Check a filter predicate that combines two columns of each group."""
    frame = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    parity = frame["A"].apply(lambda x: x % 2)

    def a_exceeds_b_by_ten(group):
        return group["A"].sum() - group["B"].sum() > 10

    result = frame.groupby(parity).filter(a_exceeds_b_by_ten)

    expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(result, expected)
def test_filter_mixed_df():
    """Check ``filter`` on a frame holding an integer and a string column."""
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    parity = frame["A"].apply(lambda x: x % 2)

    result = frame.groupby(parity).filter(lambda x: x["A"].sum() > 10)

    expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(result, expected)
def test_filter_out_all_groups():
    """Check that a predicate no group satisfies yields an empty selection."""
    # Series case
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    ser_groups = ser.groupby(ser.apply(lambda x: x % 2))
    tm.assert_series_equal(ser_groups.filter(lambda x: x.mean() > 1000), ser[[]])

    # DataFrame case
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = frame.groupby(frame["A"].apply(lambda x: x % 2))
    tm.assert_frame_equal(
        frame_groups.filter(lambda x: x["A"].sum() > 1000), frame.loc[[]]
    )
def test_filter_out_no_groups():
    """Check that a predicate every group satisfies returns the input unchanged."""
    # Series case
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    ser_groups = ser.groupby(ser.apply(lambda x: x % 2))
    tm.assert_series_equal(ser_groups.filter(lambda x: x.mean() > 0), ser)

    # DataFrame case
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = frame.groupby(frame["A"].apply(lambda x: x % 2))
    tm.assert_frame_equal(frame_groups.filter(lambda x: x["A"].mean() > 0), frame)
def test_filter_out_all_groups_in_df():
    """Check the result of filtering out every group, for both ``dropna`` settings."""
    # GH12768

    def b_sum_above_five(group):
        return group["b"].sum() > 5

    # dropna=False keeps the shape and fills every cell with NaN.
    frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    kept = frame.groupby("a").filter(b_sum_above_five, dropna=False)
    all_nan = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(all_nan, kept)

    # dropna=True leaves an empty int64 frame.
    frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    kept = frame.groupby("a").filter(b_sum_above_five, dropna=True)
    empty = DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(empty, kept)
def test_filter_condition_raises():
    """Check the error reported when the filter predicate itself raises.

    The predicate raises ``ValueError`` for a zero-sum group; the test
    expects ``filter`` to surface a ``TypeError`` about a boolean result.
    """

    def raise_if_sum_is_zero(x):
        total = x.sum()
        if total == 0:
            raise ValueError
        return total > 0

    ser = Series([-1, 0, 1, 2])
    grouped = ser.groupby(ser.apply(lambda x: x % 2))

    with pytest.raises(TypeError, match="the filter must return a boolean result"):
        grouped.filter(raise_if_sum_is_zero)
def test_filter_with_axis_in_groupby():
    """Check ``filter`` on a groupby taken along the column axis."""
    # issue 11041
    columns = pd.MultiIndex.from_product([range(10), [0, 1]])
    values = np.arange(100).reshape(-1, 20)
    data = DataFrame(values, columns=columns, dtype="int64")

    column_groups = data.groupby(level=0, axis=1)
    result = column_groups.filter(lambda x: x.iloc[0, 0] > 10)

    tm.assert_frame_equal(result, data.iloc[:, 12:20])
def test_filter_bad_shapes():
    """Check the errors raised when the predicate does not return a scalar.

    Each predicate is applied to both a DataFrameGroupBy and a SeriesGroupBy.
    """
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    series_msg = "the filter must return a boolean result"
    frame_msg = "filter function returned a DataFrame, but expected a scalar bool"

    # (predicate, message expected from the DataFrameGroupBy)
    cases = [
        (lambda x: x, frame_msg),
        (lambda x: x == 1, frame_msg),
        (
            lambda x: np.outer(x, x),
            "can't multiply sequence by non-int of type 'str'",
        ),
    ]

    for func, df_msg in cases:
        with pytest.raises(TypeError, match=df_msg):
            g_df.filter(func)
        with pytest.raises(TypeError, match=series_msg):
            g_s.filter(func)
def test_filter_nan_is_false():
    """Check that a predicate returning NaN drops every group."""
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    labels = frame["B"]

    def always_nan(group):
        return np.nan

    frame_result = frame.groupby(frame["B"]).filter(always_nan)
    tm.assert_frame_equal(frame_result, frame.loc[[]])

    series_result = labels.groupby(labels).filter(always_nan)
    tm.assert_series_equal(series_result, labels[[]])
def test_filter_against_workaround():
    """Compare ``filter`` with boolean selection built from ``transform``.

    For several groupings, ``grouped.filter(f)`` must select the same rows
    as indexing the original object with ``grouped.transform(f)`` cast to bool.
    """
    np.random.seed(0)

    def check_series(ser):
        # Group on values rounded to the nearest ten, keep groups with mean > 10.
        grouped = ser.groupby(ser.apply(lambda x: np.round(x, -1)))

        def mean_above_ten(x):
            return x.mean() > 10

        via_transform = ser[grouped.transform(mean_above_ten).astype("bool")]
        via_filter = grouped.filter(mean_above_ten)
        tm.assert_series_equal(via_filter.sort_values(), via_transform.sort_values())

    # Series of ints
    check_series(Series(np.random.randint(0, 100, 1000)))

    # Series of floats
    check_series(100 * Series(np.random.random(1000)))

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase

    N = 1000
    alphabet = np.array(list(ascii_lowercase))
    random_letters = alphabet.take(np.random.randint(0, 26, N))
    df = DataFrame(
        {
            "ints": Series(np.random.randint(0, 100, N)),
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )

    # Group by ints; filter on floats.
    by_ints = df.groupby("ints")
    mask = by_ints.floats.transform(lambda x: x.mean() > N / 20).astype("bool")
    tm.assert_frame_equal(
        by_ints.filter(lambda x: x["floats"].mean() > N / 20), df[mask]
    )

    # Group by floats (rounded); filter on strings.
    by_floats = df.groupby(df.floats.apply(lambda x: np.round(x, -1)))
    mask = by_floats.letters.transform(lambda x: len(x) < N / 10).astype("bool")
    tm.assert_frame_equal(
        by_floats.filter(lambda x: len(x.letters) < N / 10), df[mask]
    )

    # Group by strings; filter on ints.
    by_letters = df.groupby("letters")
    mask = by_letters.ints.transform(lambda x: x.mean() > N / 20).astype("bool")
    tm.assert_frame_equal(
        by_letters.filter(lambda x: x["ints"].mean() > N / 20), df[mask]
    )
def test_filter_using_len():
    """Check predicates based on ``len`` of the group, for frames and series."""
    # BUG GH4447
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    frame_groups = frame.groupby("B")

    # Only the four-row "b" group is longer than two.
    b_rows = np.arange(2, 6)
    expected = DataFrame(
        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
        index=b_rows,
    )
    tm.assert_frame_equal(frame_groups.filter(lambda x: len(x) > 2), expected)

    # No group is longer than four.
    tm.assert_frame_equal(frame_groups.filter(lambda x: len(x) > 4), frame.loc[[]])

    # Series have always worked properly, but we'll test anyway.
    labels = frame["B"]
    label_groups = labels.groupby(labels)

    expected = Series(4 * ["b"], index=np.arange(2, 6), name="B")
    tm.assert_series_equal(label_groups.filter(lambda x: len(x) > 2), expected)

    tm.assert_series_equal(label_groups.filter(lambda x: len(x) > 4), labels[[]])
def test_filter_maintains_ordering():
    """Check that ``filter`` keeps rows in their original positional order.

    The same check runs with a sequential, a decreasing and a shuffled index.
    """
    kept_positions = [1, 2, 4, 7]

    def check_order(frame):
        # DataFrameGroupBy.filter
        filtered = frame.groupby("tag").filter(lambda x: len(x) > 1)
        tm.assert_frame_equal(filtered, frame.iloc[kept_positions])

        # SeriesGroupBy.filter, grouped by the frame's "tag" column
        pid = frame["pid"]
        filtered = pid.groupby(frame["tag"]).filter(lambda x: len(x) > 1)
        tm.assert_series_equal(filtered, pid.iloc[kept_positions])

    # Simple case: index is sequential. #4621
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
    )
    check_order(df)

    # Now index is sequentially decreasing.
    df.index = np.arange(len(df) - 1, -1, -1)
    check_order(df)

    # Index is shuffled.
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    check_order(df)
def test_filter_multiple_timestamp():
    """Check ``filter`` and ``transform`` when one grouping key is a Timestamp column."""
    # GH 10114
    frame = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )
    grouped = frame.groupby(["B", "C"])
    grouped_a = grouped["A"]

    def keep_all(group):
        return True

    # Column selection
    tm.assert_series_equal(frame["A"], grouped_a.filter(keep_all))
    tm.assert_series_equal(
        grouped_a.transform(len), Series([2, 3, 2, 3, 3], name="A")
    )

    # Whole frame
    tm.assert_frame_equal(frame, grouped.filter(keep_all))
    tm.assert_frame_equal(
        grouped.transform("sum"), DataFrame({"A": [2, 8, 2, 8, 8]})
    )
    tm.assert_frame_equal(
        grouped.transform(len), DataFrame({"A": [2, 3, 2, 3, 3]})
    )
def test_filter_and_transform_with_non_unique_int_index():
    """Check ``filter``/``transform`` on data with a duplicated integer index.

    Rows are grouped on ``tag``; only the tags occurring more than once
    (row positions 1, 2, 4 and 7) satisfy ``len(x) > 1``.
    """
    # GH4620
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not rely on implicit
    # int64 -> float64 upcasting during assignment.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_multiple_non_unique_int_index():
    """Check ``filter``/``transform`` when several integer index labels repeat.

    Rows are grouped on ``tag``; only the tags occurring more than once
    (row positions 1, 2, 4 and 7) satisfy ``len(x) > 1``.
    """
    # GH4620
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not rely on implicit
    # int64 -> float64 upcasting during assignment.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_float_index():
    """Check ``filter``/``transform`` on data with a duplicated float index.

    Rows are grouped on ``tag``; only the tags occurring more than once
    (row positions 1, 2, 4 and 7) satisfy ``len(x) > 1``.
    """
    # GH4620
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not rely on implicit
    # int64 -> float64 upcasting during assignment.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index():
    """Check ``filter``/``transform`` on data with a duplicated Timestamp index.

    Rows are grouped on ``tag``; only the tags occurring more than once
    (row positions 1, 2, 4 and 7) satisfy ``len(x) > 1``.
    """
    # GH4620
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not rely on implicit
    # int64 -> float64 upcasting during assignment.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_string_index():
    """Check ``filter``/``transform`` on data with a duplicated string index.

    Rows are grouped on ``tag``; only the tags occurring more than once
    (row positions 1, 2, 4 and 7) satisfy ``len(x) > 1``.
    """
    # GH4620
    index = list("bbbcbbab")
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast up front so that writing NaN below does not rely on implicit
    # int64 -> float64 upcasting during assignment.
    expected = df.copy().astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_has_access_to_grouped_cols():
    """Check that the predicate can read the column used as the grouping key."""
    frame = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    by_a = frame.groupby("A")

    # previously didn't have access to col A #????
    kept = by_a.filter(lambda x: x["A"].sum() == 2)

    tm.assert_frame_equal(kept, frame.iloc[[0, 1]])
def test_filter_enforces_scalarness():
    """Check that a predicate returning an element-wise comparison is rejected."""
    rows = [
        ["best", "a", "x"],
        ["worst", "b", "y"],
        ["best", "c", "x"],
        ["best", "d", "y"],
        ["worst", "d", "y"],
        ["worst", "d", "y"],
        ["best", "d", "z"],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    by_c = frame.groupby("c")

    with pytest.raises(TypeError, match="filter function returned a.*"):
        by_c.filter(lambda g: g["a"] == "best")
def test_filter_non_bool_raises():
    """Check that a predicate returning a group mean instead of a bool is rejected."""
    rows = [
        ["best", "a", 1],
        ["worst", "b", 1],
        ["best", "c", 1],
        ["best", "d", 1],
        ["worst", "d", 1],
        ["worst", "d", 1],
        ["best", "d", 1],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    by_a = frame.groupby("a")

    with pytest.raises(TypeError, match="filter function returned a.*"):
        by_a.filter(lambda g: g.c.mean())
def test_filter_dropna_with_empty_groups():
    """Check both ``dropna`` settings when no group passes the predicate."""
    # GH 10780
    labels = np.repeat([1, 2, 3], 3)
    data = Series(np.random.rand(9), index=labels)
    by_level = data.groupby(level=0)

    def mean_above_one(x):
        # np.random.rand draws from [0, 1), so no group mean exceeds 1.
        return x.mean() > 1

    # dropna=False: same shape, every value replaced by NaN.
    kept_shape = by_level.filter(mean_above_one, dropna=False)
    all_nan = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(kept_shape, all_nan)

    # dropna=True: an empty float64 series.
    dropped = by_level.filter(mean_above_one, dropna=True)
    empty = Series(index=pd.Index([], dtype=int), dtype=np.float64)
    tm.assert_series_equal(dropped, empty)
def test_filter_consistent_result_before_after_agg_func():
    """Check ``filter`` returns the same frame before and after an aggregation call."""
    # GH 17091
    frame = DataFrame({"data": range(6), "key": list("ABCABC")})
    by_key = frame.groupby("key")
    expected = DataFrame({"data": range(6), "key": list("ABCABC")})

    def keep_all(group):
        return True

    tm.assert_frame_equal(by_key.filter(keep_all), expected)

    # Run an aggregation on the same groupby object, then filter again.
    by_key.sum()
    tm.assert_frame_equal(by_key.filter(keep_all), expected)

View File

@@ -0,0 +1,444 @@
import numpy as np
import pytest
from pandas import (
CategoricalIndex,
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
@pytest.fixture
def education_df():
    """Return a six-row frame with ``gender``, ``education`` and ``country`` columns."""
    columns = {
        "gender": ["male", "male", "female", "male", "female", "male"],
        "education": ["low", "medium", "high", "low", "high", "low"],
        "country": ["US", "FR", "US", "FR", "FR", "FR"],
    }
    return DataFrame(columns)
def test_axis(education_df):
    """Check ``value_counts`` raises NotImplementedError for an ``axis=1`` groupby."""
    column_groups = education_df.groupby("country", axis=1)
    with pytest.raises(NotImplementedError, match="axis"):
        column_groups.value_counts()
def test_bad_subset(education_df):
    """Check ``value_counts`` raises ValueError when ``subset`` names the grouping key."""
    by_country = education_df.groupby("country")
    with pytest.raises(ValueError, match="subset"):
        by_country.value_counts(subset=["country"])
def test_basic(education_df):
    """Check normalized ``value_counts`` of gender/education within each country."""
    # gh43564
    expected_index = MultiIndex.from_tuples(
        [
            ("FR", "male", "low"),
            ("FR", "female", "high"),
            ("FR", "male", "medium"),
            ("US", "female", "high"),
            ("US", "male", "low"),
        ],
        names=["country", "gender", "education"],
    )
    expected = Series(data=[0.5, 0.25, 0.25, 0.5, 0.5], index=expected_index)

    selected = education_df.groupby("country")[["gender", "education"]]
    result = selected.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)
def _frame_value_counts(df, keys, normalize, sort, ascending):
return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df, groupby, normalize, sort, ascending, as_index, frame
):
    """Cross-check ``DataFrameGroupBy.value_counts`` against two reference paths.

    With ``frame=True`` the reference is ``gp.apply`` of
    ``_frame_value_counts``; otherwise it is ``SeriesGroupBy.value_counts``
    on a combined ``"both"`` column that is split back into its two parts.
    """
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    if frame:
        # compare against apply with DataFrame value_counts
        expected = gp.apply(
            _frame_value_counts, ["gender", "education"], normalize, sort, ascending
        )

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # The apply result is a Series; turn it into a frame whose value
            # column carries the same label the groupby result uses.
            name = "proportion" if normalize else "count"
            expected = expected.reset_index().rename({0: name}, axis=1)
            # Relabel the group column to match ``result``; which relabelling
            # applies depends on how the grouping key was specified.
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts
        # NOTE: this adds a "both" column to the fixture frame in place.
        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(
            normalize=normalize, sort=sort, ascending=ascending
        )
        expected.name = None
        if as_index:
            # Rebuild the index with "both" split into gender and education.
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
            del index_frame["both"]
            index_frame = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame)
            tm.assert_series_equal(result, expected)
        else:
            # Insert the two split columns after the group column, then drop "both".
            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            del expected["both"]
            tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
    [
        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
        (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
        (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
    ],
)
def test_compound(
    education_df,
    normalize,
    sort,
    ascending,
    expected_rows,
    expected_count,
    expected_group_size,
):
    """Check ``value_counts`` with two grouping keys and ``as_index=False``.

    The expected frame is assembled from the fixture rows named in
    ``expected_rows`` plus a ``count`` or ``proportion`` column.
    """
    # Multiple groupby keys and as_index=False
    grouped = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = grouped["education"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )

    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        source = education_df[column]
        expected[column] = [source[row] for row in expected_rows]

    if not normalize:
        expected["count"] = expected_count
    else:
        # proportion = count within the group / size of that group
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size

    tm.assert_frame_equal(result, expected)
@pytest.fixture
def animals_df():
    """Return a four-row frame of ``key``, ``num_legs`` and ``num_wings`` indexed by animal name."""
    columns = {
        "key": [1, 1, 1, 1],
        "num_legs": [2, 4, 4, 6],
        "num_wings": [2, 0, 0, 0],
    }
    return DataFrame(columns, index=["falcon", "dog", "cat", "ant"])
@pytest.mark.parametrize(
    "sort, ascending, normalize, expected_data, expected_index",
    [
        (False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
        (True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
        (True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
        (True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
    ],
)
def test_data_frame_value_counts(
    animals_df, sort, ascending, normalize, expected_data, expected_index
):
    """Check ``DataFrame.value_counts`` and the groupby version give the same expected series."""
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests from frame/methods/test_value_counts.py
    options = {"sort": sort, "ascending": ascending, "normalize": normalize}

    index = MultiIndex.from_arrays(
        expected_index, names=["key", "num_legs", "num_wings"]
    )
    expected = Series(data=expected_data, index=index)

    from_frame = animals_df.value_counts(**options)
    tm.assert_series_equal(from_frame, expected)

    from_groupby = animals_df.groupby("key").value_counts(**options)
    tm.assert_series_equal(from_groupby, expected)
@pytest.fixture
def nulls_df():
    """Return a nine-row frame with columns ``A``-``D`` containing NaN entries."""
    nan = np.nan
    columns = {
        "A": [1, 1, nan, 4, nan, 6, 6, 6, 6],
        "B": [1, 1, 3, nan, nan, 6, 6, 6, 6],
        "C": [1, 2, 3, 4, 5, 6, nan, 8, nan],
        "D": [1, 2, 3, 4, 5, 6, 7, nan, nan],
    }
    return DataFrame(columns)
@pytest.mark.parametrize(
    "group_dropna, count_dropna, expected_rows, expected_values",
    [
        (
            False,
            False,
            [0, 1, 3, 5, 7, 6, 8, 2, 4],
            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
        ),
        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
    ],
)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values
):
    """Check ``value_counts`` for each pairing of groupby ``dropna`` and count ``dropna``.

    The expected index is built from the fixture rows named in ``expected_rows``.
    """
    grouped = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = grouped.value_counts(normalize=True, sort=True, dropna=count_dropna)

    index_columns = DataFrame(
        {
            column: [nulls_df[column][row] for row in expected_rows]
            for column in nulls_df.columns
        }
    )
    expected = Series(
        data=expected_values, index=MultiIndex.from_frame(index_columns)
    )

    tm.assert_series_equal(result, expected)
@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    """Return a four-row names frame whose ``middle_name`` holds ``nulls_fixture`` twice."""
    columns = {
        "key": [1, 1, 1, 1],
        "first_name": ["John", "Anne", "John", "Beth"],
        "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
    }
    return DataFrame(columns)
@pytest.mark.parametrize(
    "dropna, expected_data, expected_index",
    [
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize", [False, True])
def test_data_frame_value_counts_dropna(
    names_with_nulls_df, dropna, normalize, expected_data, expected_index
):
    """Check ``value_counts`` with nulls, on the frame and on a groupby of it."""
    # GH 41334
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests with nulls from frame/methods/test_value_counts.py
    options = {"dropna": dropna, "normalize": normalize}

    expected = Series(data=expected_data, index=expected_index)
    if normalize:
        # Every expected count is 1, so each proportion is 1 / number of rows.
        expected /= float(len(expected_data))

    from_frame = names_with_nulls_df.value_counts(**options)
    tm.assert_series_equal(from_frame, expected)

    from_groupby = names_with_nulls_df.groupby("key").value_counts(**options)
    tm.assert_series_equal(from_groupby, expected)
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize(
    "observed, expected_index",
    [
        (
            False,
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("FR", "female", "low"),
                ("FR", "female", "medium"),
                ("FR", "male", "high"),
                ("US", "female", "high"),
                ("US", "male", "low"),
                ("US", "female", "low"),
                ("US", "female", "medium"),
                ("US", "male", "high"),
                ("US", "male", "medium"),
            ],
        ),
        (
            True,
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("US", "female", "high"),
                ("US", "male", "low"),
            ],
        ),
    ],
)
@pytest.mark.parametrize(
    "normalize, expected_data",
    [
        (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
        (
            True,
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical(
    education_df, as_index, observed, expected_index, normalize, expected_data
):
    """Check ``value_counts`` on categorical columns for both ``observed`` settings."""
    # Test categorical data whether or not observed
    categorical_df = education_df.astype("category")
    grouped = categorical_df.groupby("country", as_index=as_index, observed=observed)
    result = grouped.value_counts(normalize=normalize)

    # With observed=True only the strictly positive entries are expected.
    if observed:
        data = expected_data[expected_data > 0.0]
    else:
        data = expected_data
    expected_series = Series(
        data=data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
    )

    # Replace each of the three index levels by a CategoricalIndex of itself.
    for level in range(3):
        categorical_level = CategoricalIndex(expected_series.index.levels[level])
        expected_series.index = expected_series.index.set_levels(
            categorical_level, level=level
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
        return

    value_label = "proportion" if normalize else "count"
    tm.assert_frame_equal(result, expected_series.reset_index(name=value_label))
@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    """Check ``value_counts`` when grouping by a list, a column name and a function at once."""
    # Test multiple groupings
    frame = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    keys = [[4, 5, 4], "A", lambda i: 7 if i == 1 else 8]

    result = frame.groupby(keys, as_index=False).value_counts(
        sort=True, normalize=normalize
    )

    expected_columns = {
        "level_0": [4, 4, 5],
        "A": [1, 1, 2],
        "level_2": [8, 8, 7],
        "B": [1, 3, 2],
        expected_label: expected_values,
    }
    tm.assert_frame_equal(result, DataFrame(expected_columns))
@pytest.mark.parametrize(
    "test, expected_names",
    [
        ("repeat", ["a", None, "d", "b", "b", "e"]),
        ("level", ["a", None, "d", "b", "c", "level_1"]),
    ],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_name_clashes(test, expected_names, as_index):
    """Check ``value_counts`` when column names repeat or clash with ``level_1``.

    With ``as_index=True`` a series is expected; with ``as_index=False``
    a ``ValueError`` matching "cannot insert" is expected.
    """
    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
    if test == "repeat":
        df.columns = list("abbde")
    else:
        df.columns = list("abcd") + ["level_1"]

    if not as_index:
        with pytest.raises(ValueError, match="cannot insert"):
            df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
        return

    result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
    expected_index = MultiIndex.from_tuples(
        [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
        names=expected_names,
    )
    tm.assert_series_equal(result, Series(data=(1, 1), index=expected_index))
def test_ambiguous_grouping():
    """Check ``value_counts`` when the grouping list is as long as the frame."""
    # Test that groupby is not confused by groupings length equal to row count
    frame = DataFrame({"a": [1, 1]})
    result = frame.groupby([1, 1]).value_counts()

    expected_index = MultiIndex.from_tuples([[1, 1]], names=[None, "a"])
    tm.assert_series_equal(result, Series([2], index=expected_index))

View File

@@ -0,0 +1,372 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    """Check two-key groupby ``sum`` when a single key value is null."""
    # GH 3729 this is to test that NA is in one group
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=list("abcde"))
    summed = frame.groupby(["a", "b"], dropna=dropna).sum()

    expected_index = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])

    # Since right now, by default MI will drop NA from levels when we create MI
    # via `from_*`, so we need to add NA for level manually afterwards.
    if not dropna:
        expected_index = expected_index.set_levels(["A", "B", np.nan], level="b")

    tm.assert_frame_equal(summed, pd.DataFrame(outputs, index=expected_index))
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    """Check two-key groupby ``sum`` when nulls of two representations appear in both keys."""
    # GH 3729 this is to test that NA in different groups with different representations
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=list("abcde"))
    summed = frame.groupby(["a", "b"], dropna=dropna).sum()

    expected_index = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])

    # Since right now, by default MI will drop NA from levels when we create MI
    # via `from_*`, so we need to add NA for level manually afterwards.
    if not dropna:
        level_values = ["A", "B", np.nan]
        expected_index = expected_index.set_levels(
            [level_values, list(level_values)]
        )

    tm.assert_frame_equal(summed, pd.DataFrame(outputs, index=expected_index))
@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729
    rows = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d"])
    expected_index = pd.Index(idx, dtype="object", name="a")
    expected = pd.DataFrame(outputs, index=expected_index)

    result = frame.groupby("a", dropna=dropna).sum()
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    # Group a Series on index level 0, which contains a NaN label
    values = pd.Series([1, 2, 3, 3], index=idx)
    grouped = values.groupby(level=0, dropna=dropna)
    tm.assert_series_equal(grouped.sum(), expected)
@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    # Group a Series by an external key list that contains a NaN
    speeds = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )
    keys = ["a", "b", "a", np.nan]
    result = speeds.groupby(keys, dropna=dropna).mean()
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
    # GH 36604
    frame = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    grouper = frame.groupby("A", dropna=dropna).grouper
    # The ``dropna`` passed to groupby must be visible on the grouper itself.
    assert grouper.dropna == dropna
@pytest.mark.parametrize(
"dropna,input_index,expected_data,expected_index",
[
(True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
(True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
(
True,
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
),
),
(False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
(False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
(
False,
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
),
],
)
def test_groupby_dataframe_slice_then_transform(
dropna, input_index, expected_data, expected_index
):
# GH35014 & GH35612
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
gb = df.groupby("A", dropna=dropna)
result = gb.transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
tm.assert_frame_equal(result, expected)
result = gb[["B"]].transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
tm.assert_frame_equal(result, expected)
result = gb["B"].transform(len)
expected = pd.Series(expected_data["B"], index=expected_index, name="B")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d", "e"])
    # A different reducer per column: two builtins and one string alias.
    reducers = {"c": sum, "d": max, "e": "min"}
    result = frame.groupby(["a", "b"], dropna=dropna).agg(reducers)

    expected_index = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
    if not dropna:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be added back to level "b" by hand.
        expected_index = expected_index.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=expected_index)

    tm.assert_frame_equal(result, expected)
@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    dt_keys = [
        datetime1,
        unique_nulls_fixture,
        datetime2,
        unique_nulls_fixture2,
        datetime1,
        datetime1,
    ]
    frame = pd.DataFrame({"values": [1, 2, 3, 4, 5, 6], "dt": dt_keys})

    # With dropna=False both null keys are expected under a single NaN label.
    indexes = [datetime1, datetime2] if dropna else [datetime1, datetime2, np.nan]
    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

    result = frame.groupby("dt", dropna=dropna).agg({"values": sum})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889
    frame = pd.DataFrame(data)
    result = frame.groupby("groups", dropna=dropna).apply(
        lambda grp: pd.DataFrame({"values": range(len(grp))})
    )

    # zip() stops at the shorter input, so a dropped NaN row is left out here.
    index_tuples = tuple(zip(data["groups"], selected_data["values"]))
    expected_index = pd.MultiIndex.from_tuples(index_tuples, names=["groups", None])
    if levels and not dropna:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be added back by hand.
        expected_index = expected_index.set_levels(levels, level="groups")
    expected = pd.DataFrame(selected_data, index=expected_index)

    tm.assert_frame_equal(result, expected)
def test_groupby_nan_included():
    # GH 35646
    frame = pd.DataFrame(
        {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    )
    result = frame.groupby("group", dropna=False).indices

    expected = {
        "g1": np.array([0, 2], dtype=np.intp),
        "g2": np.array([3], dtype=np.intp),
        np.nan: np.array([1, 4], dtype=np.intp),
    }
    for actual_positions, expected_positions in zip(
        result.values(), expected.values()
    ):
        tm.assert_numpy_array_equal(actual_positions, expected_positions)

    # NaN never compares equal, so the NaN key is checked separately.
    result_keys = list(result.keys())
    assert np.isnan(result_keys[2])
    assert result_keys[0:2] == ["g1", "g2"]

View File

@@ -0,0 +1,133 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
NaT,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    records = [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)]
    df = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)

    # Rows with a missing key, and the last row of every group, shift to NaN.
    shifted = [
        (i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)
    ]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    result = df.groupby(["A", "B"]).shift(-1)
    tm.assert_frame_equal(result, expected)
def test_group_shift_with_fill_value():
    # GH #24128
    n_rows = 24
    records = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)

    # The last row of every group has nothing to shift in, so it takes 0.
    shifted = [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    result = df.groupby(["A", "B"]).shift(-1, fill_value=0)
    tm.assert_frame_equal(result, expected)
def test_group_shift_lose_timezone():
    # GH 30134
    # Timestamp.utcnow() is deprecated in newer pandas releases;
    # Timestamp.now("UTC") yields the same tz-aware "current time" value.
    now_dt = Timestamp.now("UTC")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    # A zero-period shift must hand back the tz-aware value unchanged.
    result = df.groupby("a").shift(0).iloc[0]
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)
def test_group_diff_real(any_real_numpy_dtype):
    # Grouped diff on a real-valued column, for each real numpy dtype
    df = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )
    # The small dtypes listed here are expected to come back as float32.
    small_dtypes = ["int8", "int16", "float32"]
    exp_dtype = "float32" if any_real_numpy_dtype in small_dtypes else "float"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")

    result = df.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
            Timestamp("2013-01-03"),
        ],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data):
    # Both inputs step by one day, so the expected difference is the same.
    df = DataFrame({"a": [1, 2, 2], "b": data})
    expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
    result = df.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
def test_group_diff_bool():
    # Grouped diff on a boolean column
    keys = [1, 2, 3, 3, 2]
    flags = [True, True, False, False, True]
    df = DataFrame({"a": keys, "b": flags})
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
    result = df.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
def test_group_diff_object_raises(object_dtype):
    # Strings cannot be subtracted, so a grouped diff has to raise TypeError
    df = DataFrame(
        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
    )
    grouped_column = df.groupby("a")["b"]
    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
        grouped_column.diff()
def test_empty_shift_with_fill():
    # GH 41264, single-index check
    df = DataFrame(columns=["a", "b", "c"])
    keys = ["a"]
    shifted = df.groupby(keys).shift(1)
    shifted_with_fill = df.groupby(keys).shift(1, fill_value=0)
    # On an empty frame the fill value must not change the result.
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)
def test_multindex_empty_shift_with_fill():
    # GH 41264, multi-index check
    df = DataFrame(columns=["a", "b", "c"])
    keys = ["a", "b"]
    shifted = df.groupby(keys).shift(1)
    shifted_with_fill = df.groupby(keys).shift(1, fill_value=0)
    # On an empty frame the fill value must not change the result.
    tm.assert_frame_equal(shifted, shifted_with_fill)
    tm.assert_index_equal(shifted.index, shifted_with_fill.index)

View File

@@ -0,0 +1,113 @@
from datetime import datetime
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.core.groupby.base import maybe_normalize_deprecated_kernels
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_groupby_preserves_subclass(obj, groupby_func):
    # GH28330 -- preserve subclass through groupby operations

    if isinstance(obj, Series) and groupby_func == "corrwith":
        pytest.skip("Not applicable")

    # TODO(2.0) Remove after pad/backfill deprecation enforced
    groupby_func = maybe_normalize_deprecated_kernels(groupby_func)
    grouped = obj.groupby(np.arange(0, 10))

    # Groups should preserve subclass type
    assert isinstance(grouped.get_group(0), type(obj))

    # Positional arguments required by the kernels that cannot be called bare
    if groupby_func in {"fillna", "nth"}:
        args = [0]
    elif groupby_func == "corrwith":
        args = [obj]
    elif groupby_func == "tshift":
        args = [0, 0]
    else:
        args = []

    result1 = getattr(grouped, groupby_func)(*args)
    result2 = grouped.agg(groupby_func, *args)

    # Reduction or transformation kernels should preserve type
    returns_series = isinstance(obj, DataFrame) and groupby_func in {
        "ngroup",
        "cumcount",
        "size",
    }
    expected_type = tm.SubclassedSeries if returns_series else type(obj)
    assert isinstance(result1, expected_type)

    # Confirm .agg() groupby operations return same results
    compare = (
        tm.assert_frame_equal
        if isinstance(result1, DataFrame)
        else tm.assert_series_equal
    )
    compare(result1, result2)
def test_groupby_preserves_metadata():
    # GH-37343
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    assert "testattr" in custom_df._metadata
    custom_df.testattr = "hello"
    for _, group_df in custom_df.groupby("c"):
        assert group_df.testattr == "hello"

    # GH-45314
    def frame_attr(group):
        # each group handed to apply must still be the subclass with metadata
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        return group.testattr

    def series_attr(group):
        # same check for groups taken from a subclassed Series
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        return group.testattr

    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))

    result = custom_df.groupby("c").apply(frame_attr)
    tm.assert_series_equal(result, expected)

    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"

    result = custom_series.groupby(custom_df["c"]).apply(series_attr)
    tm.assert_series_equal(result, expected)

    result = custom_series.groupby(custom_df["c"]).agg(series_attr)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
    # GH28330 -- preserve subclass through groupby.resample()
    purchase_dates = [
        datetime(2013, 9, 1, 13, 0),
        datetime(2013, 9, 1, 13, 5),
        datetime(2013, 10, 1, 20, 0),
        datetime(2013, 10, 3, 10, 0),
        datetime(2013, 12, 2, 12, 0),
        datetime(2013, 9, 2, 14, 0),
    ]
    df = obj(
        {
            "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": purchase_dates,
        }
    ).set_index("Date")

    # Confirm groupby.resample() preserves dataframe type
    result = df.groupby("Buyer").resample("5D").sum()
    assert isinstance(result, obj)

View File

@@ -0,0 +1,82 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    # Frame indexed by the parametrized level name(s): "inner" alone, or
    # "inner" together with "outer".
    index_levels = request.param
    columns = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 3, 1, 2, 3],
        "A": np.arange(6),
        "B": ["one", "one", "two", "two", "one", "one"],
    }
    df = pd.DataFrame(columns)
    return df.set_index(index_levels) if index_levels else df
@pytest.fixture()
def series():
    # Column "A" as a Series indexed by the levels "outer", "inner" and "B"
    columns = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 3, 1, 2, 3],
        "A": np.arange(6),
        "B": ["one", "one", "two", "two", "one", "one"],
    }
    indexed = pd.DataFrame(columns).set_index(["outer", "inner", "B"])
    return indexed["A"]
@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    # Grouping by an index level's name must match an explicit level Grouper
    expected = frame.groupby(groupers).mean()
    result = frame.groupby(key_strs).mean()
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    # Compute expected result from explicit level Groupers, keeping the
    # scalar-versus-list shape of ``levels``.
    groupers = (
        [pd.Grouper(level=name) for name in levels]
        if isinstance(levels, list)
        else pd.Grouper(level=levels)
    )
    expected = series.groupby(groupers).mean()

    # Compute and check result
    result = series.groupby(levels).mean()
    tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,316 @@
# Test GroupBy._positional_selector positional grouped indexing GH#42864
import random
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [2, [5]],
        [5, []],
        [-1, [3, 4, 7]],
        [-2, [1, 6]],
        [-6, []],
    ],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test single integer
    expected = slice_test_df.iloc[expected_rows]
    result = slice_test_grouped._positional_selector[arg]
    tm.assert_frame_equal(result, expected)
def test_slice(slice_test_df, slice_test_grouped):
    # Test single slice
    key = slice(0, 3, 2)
    expected = slice_test_df.iloc[[0, 1, 4, 5]]
    result = slice_test_grouped._positional_selector[key]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [[0, 2], [0, 1, 4, 5]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [range(0, 3, 2), [0, 1, 4, 5]],
        [{0, 2}, [0, 1, 4, 5]],
    ],
    ids=[
        "list",
        "negative",
        "range",
        "set",
    ],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
    # Test lists of integers and integer valued iterables
    expected = slice_test_df.iloc[expected_rows]
    result = slice_test_grouped._positional_selector[arg]
    tm.assert_frame_equal(result, expected)
def test_ints(slice_test_df, slice_test_grouped):
    # Test tuple of ints
    key = (0, 2, -1)
    expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]
    result = slice_test_grouped._positional_selector[key]
    tm.assert_frame_equal(result, expected)
def test_slices(slice_test_df, slice_test_grouped):
    # Test tuple of slices
    key = (slice(None, 2), slice(-2, None))
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
    result = slice_test_grouped._positional_selector[key]
    tm.assert_frame_equal(result, expected)
def test_mix(slice_test_df, slice_test_grouped):
    # Test mixed tuple of ints and slices
    key = (0, 1, slice(-2, None))
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
    result = slice_test_grouped._positional_selector[key]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_as_index(slice_test_df, arg, expected_rows):
    # Test the default as_index behaviour
    grouped = slice_test_df.groupby("Group", sort=False)
    expected = slice_test_df.iloc[expected_rows]
    result = grouped._positional_selector[arg]
    tm.assert_frame_equal(result, expected)
def test_doc_examples():
    # Test the examples in the documentation
    columns = ["A", "B"]
    df = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=columns
    )
    selector = df.groupby("A", as_index=False)._positional_selector

    # single slice
    expected = pd.DataFrame([["a", 2], ["b", 5]], columns=columns, index=[1, 4])
    tm.assert_frame_equal(selector[slice(1, 2)], expected)

    # tuple of ints
    expected = pd.DataFrame(
        [["a", 2], ["a", 3], ["b", 5]], columns=columns, index=[1, 2, 4]
    )
    tm.assert_frame_equal(selector[(1, -1)], expected)
@pytest.fixture()
def multiindex_data():
    # Random per-date lists of (item, value, value) tuples, each list sorted
    # by the first value; later dates keep a random-length prefix of the items.
    ndates = 100
    nitems = 20
    dates = pd.date_range("20130101", periods=ndates, freq="D")
    items = [f"item {i}" for i in range(nitems)]

    data = {}
    for date in dates:
        n_kept = nitems - random.randint(0, 12)
        data[date] = sorted(
            (
                (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100)
                for item in items[:n_kept]
            ),
            key=lambda level: level[1],
        )
    return data
def _make_df_from_data(data):
    # Flatten {date: [(item, a, b), ...]} into a frame with columns "A" and
    # "B", indexed by (Date, Item).
    rows = {
        (date, level[0]): {"A": level[1], "B": level[2]}
        for date, levels in data.items()
        for level in levels
    }
    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.names = ("Date", "Item")
    return df
def test_multiindex(multiindex_data):
    # Test the multiindex mentioned as the use-case in the documentation
    df = _make_df_from_data(multiindex_data)

    # Trim the first and last three entries per date by hand for comparison.
    trimmed = {date: levels[3:-3] for date, levels in multiindex_data.items()}
    expected = _make_df_from_data(trimmed)

    result = df.groupby("Date", as_index=False).nth(slice(3, -3))
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
    # Test gives the same results as grouped head and tail
    n_groups = 100
    n_rows_per_group = 30
    total_rows = n_groups * n_rows_per_group

    # Rows are interleaved: row j of every group precedes row j + 1 of any.
    data = {
        "group": [f"group {g}" for g in range(n_groups)] * n_rows_per_group,
        "value": [
            f"group {g} row {j}"
            for j in range(n_rows_per_group)
            for g in range(n_groups)
        ],
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)
    size = arg if arg >= 0 else n_rows_per_group + arg

    if method == "head":
        result = grouped._positional_selector[:arg]
        if simulated:
            indices = [
                j * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if j * n_groups + i < total_rows
            ]
            expected = df.iloc[indices]
        else:
            expected = grouped.head(arg)
    else:
        result = grouped._positional_selector[-arg:]
        if simulated:
            first_row = n_rows_per_group - size
            indices = [
                (first_row + j) * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if (first_row + j) * n_groups + i >= 0
            ]
            expected = df.iloc[indices]
        else:
            expected = grouped.tail(arg)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
    # Test that a single group gives the same results as DataFrame.iloc
    n_rows = 30
    df = pd.DataFrame(
        {
            "group": ["group 0"] * n_rows,
            "value": list(range(n_rows)),
        }
    )
    key = slice(start, stop, step)

    result = df.groupby("group", as_index=False)._positional_selector[key]
    expected = df.iloc[key]
    tm.assert_frame_equal(result, expected)
def test_series():
    # Test grouped Series
    ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
    expected = pd.Series([2, 5], index=["a", "b"])
    result = ser.groupby(level=0)._positional_selector[slice(1, 2)]
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
    # Test slice with various step values
    group_sizes = [("x", 5), ("y", 4), ("z", 3)]
    columns = ["A", "B"]

    rows = [
        [label, f"{label}{i}"] for label, count in group_sizes for i in range(count)
    ]
    df = pd.DataFrame(rows, columns=columns)
    result = df.groupby("A", as_index=False)._positional_selector[::step]

    # Build the expected rows and their original row labels group by group;
    # ``offset`` is the label of each group's first row (0, 5, 9).
    expected_rows = []
    expected_index = []
    offset = 0
    for label, count in group_sizes:
        for i in range(0, count, step):
            expected_rows.append([label, f"{label}{i}"])
            expected_index.append(offset + i)
        offset += count
    expected = pd.DataFrame(expected_rows, columns=columns, index=expected_index)

    tm.assert_frame_equal(result, expected)
@pytest.fixture()
def column_group_df():
    # Two-row frame; the second row is used by test_column_axis as the
    # column grouping key.
    values = [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]]
    return pd.DataFrame(values, columns=list("ABCDEFG"))
def test_column_axis(column_group_df):
    # Positional selection when grouping along the column axis
    column_keys = column_group_df.iloc[1]
    grouped = column_group_df.groupby(column_keys, axis=1)
    expected = column_group_df.iloc[:, [1, 3]]
    result = grouped._positional_selector[slice(1, -1)]
    tm.assert_frame_equal(result, expected)
def test_columns_on_iter():
    # GitHub issue #44821
    df = pd.DataFrame({name: range(10) for name in "ABC"})

    # Group-by and select columns
    cols = ["A", "B"]
    selected = df.groupby(df.A < 4)[cols]
    for _, group_frame in selected:
        tm.assert_index_equal(group_frame.columns, pd.Index(cols))
        assert "C" not in group_frame.columns
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
    # GH#44924
    df = pd.DataFrame(
        {
            "A": [1, 2],
            "B": [3, 3],
            "C": ["G", "G"],
        }
    )
    # "A" is selected twice, wrapped in each supported container type.
    selection = func(["A", "B", "A"])
    expected = pd.DataFrame(
        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
    )
    result = df.groupby("C")[selection].mean()
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,283 @@
import numpy as np
import pytest
from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod_float64,
group_cumsum,
group_mean,
group_var,
)
from pandas.core.dtypes.common import ensure_platform_int
from pandas import isna
import pandas._testing as tm
class GroupVarTestMixin:
    # Shared group-variance checks. The methods read ``self.algo``,
    # ``self.dtype`` and ``self.rtol``, which concrete subclasses supply.

    def test_group_var_generic_1d(self):
        rng = np.random.RandomState(1234)

        out = np.full((5, 1), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(15, 1).astype(self.dtype)
        labels = np.tile(np.arange(5), (3,)).astype("intp")

        # Fortran-order reshape puts the three members of a group on one row.
        per_group = np.squeeze(values).reshape((5, 3), order="F")
        expected_out = (per_group.std(axis=1, ddof=1) ** 2)[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        rng = np.random.RandomState(1234)

        out = np.full((1, 1), np.nan).astype(self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * rng.rand(5, 1).astype(self.dtype)
        labels = np.zeros(5, dtype="intp")

        # every row belongs to the single group 0
        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        rng = np.random.RandomState(1234)

        out = np.full((5, 2), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        stacked = values.reshape(2, 5, 2)
        expected_out = np.std(stacked, ddof=1, axis=0) ** 2
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        rng = np.random.RandomState(1234)

        out = np.full((5, 2), np.nan).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        # second column is entirely missing
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        finite_column = values[:, 0].reshape(5, 2, order="F")
        expected_out = np.vstack(
            [
                finite_column.std(ddof=1, axis=1) ** 2,
                np.nan * np.ones(5),
            ]
        ).T.astype(self.dtype)
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        # Regression test from GH 10448.
        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
        labels = np.zeros(3, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == 3
        # identical inputs: the result must be non-negative and close to zero
        variance = out[0, 0]
        assert variance >= 0
        tm.assert_almost_equal(variance, 0.0)
class TestGroupVarFloat64(GroupVarTestMixin):
    # float64 run of the shared checks, plus a large-input test
    __test__ = True

    algo = staticmethod(group_var)
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):
        n_values = 10**6
        rng = np.random.RandomState(1234)

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        # uniform [0, 1) samples shifted by 10**12
        values = (rng.rand(n_values) + 10**12).astype(self.dtype)
        values = values.reshape(n_values, 1)
        labels = np.zeros(n_values, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == n_values
        # 1/12 is the variance of a uniform distribution on [0, 1)
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
class TestGroupVarFloat32(GroupVarTestMixin):
    # float32 run of the shared checks, with a looser relative tolerance
    # than the float64 class.
    __test__ = True

    dtype = np.float32
    rtol = 1e-2
    algo = staticmethod(group_var)
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
    # Compare the group_ohlc kernel with a direct per-bin computation
    obj = np.array(np.random.randn(20), dtype=dtype)

    bins = np.array([6, 12, 20])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    bin_sizes = np.diff(np.r_[0, bins])
    labels = ensure_platform_int(np.repeat(np.arange(3), bin_sizes))

    def _ohlc(group):
        # [first, max, min, last], or all-NaN when the group is all-NaN
        if isna(group).all():
            return np.repeat(np.nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    libgroupby.group_ohlc(out, counts, obj[:, None], labels)

    expected = np.array([_ohlc(chunk) for chunk in np.split(obj, bins[:-1])])
    tm.assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

    # Blank out the first bin and run again: its output row must be all-NaN.
    obj[:6] = np.nan
    libgroupby.group_ohlc(out, counts, obj[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
    """
    Check a group transform that executes a cumulative function.

    Parameters
    ----------
    pd_op : callable
        The pandas cumulative function.
    np_op : callable
        The analogous one in NumPy.
    dtype : type
        The specified dtype of the data.
    """
    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    answer = np.zeros_like(data)

    # All four rows belong to one group, so the grouped result has to match
    # the plain NumPy cumulative result.
    labels = np.array([0, 0, 0, 0], dtype=np.intp)
    pd_op(answer, data, labels, 1, False)

    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
def test_cython_group_transform_cumsum(any_real_numpy_dtype):
    # see gh-4095
    scalar_type = np.dtype(any_real_numpy_dtype).type
    _check_cython_group_transform_cumulative(group_cumsum, np.cumsum, scalar_type)
def test_cython_group_transform_cumprod():
    # see gh-4095
    dtype = np.float64
    # np.cumprod is the canonical name; the np.cumproduct alias used before
    # is deprecated and no longer exists in NumPy 2.0.
    pd_op, np_op = group_cumprod_float64, np.cumprod
    _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
def test_cython_group_transform_algos():
# see gh-4095
is_datetimelike = False
# with nans
labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
ngroups = 1
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumsum(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
# timedelta
is_datetimelike = True
data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
actual = np.zeros_like(data, dtype="int64")
group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
expected = np.array(
[
np.timedelta64(1, "ns"),
np.timedelta64(2, "ns"),
np.timedelta64(3, "ns"),
np.timedelta64(4, "ns"),
np.timedelta64(5, "ns"),
]
)
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
def test_cython_group_mean_datetimelike():
    # With is_datetimelike=True the NaT entry is left out of the mean,
    # so the expected result is (2 + 4) / 2.
    deltas = np.array(
        [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
        dtype="m8[ns]",
    )
    data = deltas[:, None].view("int64").astype("float64")

    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=True)

    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
def test_cython_group_mean_wrong_min_count():
    # Passing min_count=0 to group_mean must raise an AssertionError
    # that mentions "min_count".
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    data = np.zeros(1, dtype="float64")[:, None]
    labels = np.zeros(1, dtype=np.intp)

    arguments = (actual, counts, data, labels)
    with pytest.raises(AssertionError, match="min_count"):
        group_mean(*arguments, is_datetimelike=True, min_count=0)
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    # With is_datetimelike=False the NaT bit patterns are ordinary numbers,
    # so the expected mean is the plain average of the two float values.
    nat_pair = np.array(
        [np.timedelta64("NaT"), np.timedelta64("NaT")],
        dtype="m8[ns]",
    )
    data = nat_pair[:, None].view("int64").astype("float64")

    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    expected = np.array((data[0] + data[1]) / 2, dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

View File

@@ -0,0 +1,227 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.core.api import Int64Index
def test_max_min_non_numeric():
    """The string column "ss" must be present in grouped max/min results.

    Both the default call and the explicit ``numeric_only=False`` call are
    checked for each reduction.
    """
    # #2700
    frame = DataFrame(
        {"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}
    )

    for reduction in ("max", "min"):
        for kwargs in ({}, {"numeric_only": False}):
            reduced = getattr(frame.groupby("nn"), reduction)(**kwargs)
            assert "ss" in reduced
def test_max_min_object_multiple_columns(using_array_manager):
    """Check max/min when only some object columns can be reduced.

    Column "B" mixes ints, strings and a bool while column "C" holds only
    strings; the expected frames contain "C" alone and a FutureWarning
    matching "Dropping invalid" must be emitted.
    """
    # GH#41111 case where the aggregation is valid for some columns but not
    #  others; we split object blocks column-wise, consistent with
    #  DataFrame._reduce
    df = DataFrame(
        {
            "A": [1, 1, 2, 2, 3],
            "B": [1, "foo", 2, "bar", False],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    df._consolidate_inplace()  # should already be consolidated, but double-check
    if not using_array_manager:
        assert len(df._mgr.blocks) == 2

    gb = df.groupby("A")
    group_keys = Index([1, 2, 3], name="A")

    # "max" is valid for column "C" but not for "B"
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        maxima = gb.max(numeric_only=False)
    tm.assert_frame_equal(maxima, DataFrame({"C": ["b", "d", "e"]}, index=group_keys))

    # "min" is valid for column "C" but not for "B"
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        minima = gb.min(numeric_only=False)
    tm.assert_frame_equal(minima, DataFrame({"C": ["a", "c", "e"]}, index=group_keys))
def test_min_date_with_nans():
    """Grouped min over a column of ``datetime.date`` objects returns dates.

    Checked both through ``as_index=False`` (selecting column "c" from the
    result) and through the default indexed result.
    """
    # GH26321

    def as_dates(strings, **series_kwargs):
        # parse ISO date strings and keep only the date part
        parsed = pd.to_datetime(Series(strings, **series_kwargs), format="%Y-%m-%d")
        return parsed.dt.date

    dates = as_dates(["2019-05-09", "2019-05-09", "2019-05-09"])
    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
    expected = as_dates(["2019-05-09", "2019-05-09"], name="c")

    flat_result = df.groupby("b", as_index=False)["c"].min()["c"]
    tm.assert_series_equal(flat_result, expected)

    indexed_result = df.groupby("b")["c"].min()
    expected.index.name = "b"
    tm.assert_series_equal(indexed_result, expected)
def test_max_inat():
    """iNaT stored in an integer Series is an ordinary value for min/max."""
    # GH#40767 don't interpret iNaT as NaN
    gb = Series([1, iNaT]).groupby([1, 1])

    def check(result, expected):
        tm.assert_series_equal(result, expected, check_exact=True)

    check(gb.max(min_count=2), Series({1: 1}, dtype=np.int64))
    check(gb.min(min_count=2), Series({1: iNaT}, dtype=np.int64))

    # not enough entries -> gets masked to NaN
    check(gb.min(min_count=3), Series({1: np.nan}))
def test_max_inat_not_all_na():
    """Groups below min_count become NaN; the full group keeps its minimum."""
    # GH#40767 don't interpret iNaT as NaN

    # make sure we don't round iNaT+1 to iNaT
    values = Series([1, iNaT, 2, iNaT + 1])
    keys = [1, 2, 3, 3]
    result = values.groupby(keys).min(min_count=2)

    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
    tm.assert_series_equal(result, expected, check_exact=True)
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_column(func):
    """min/max on a selected Period column returns the periods unchanged."""
    # GH 31471
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": [1, 2], "b": periods})

    reducer = getattr(df.groupby("a")["b"], func)
    result = reducer()

    expected = Series(periods, index=Int64Index([1, 2], name="a"), name="b")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_frame(func):
    """min/max on a frame with a Period column returns the periods unchanged."""
    # GH 31471
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": [1, 2], "b": periods})

    reducer = getattr(df.groupby("a"), func)
    result = reducer()

    expected = DataFrame({"b": periods}, index=Int64Index([1, 2], name="a"))
    tm.assert_frame_equal(result, expected)
def test_aggregate_numeric_object_dtype():
    """Grouped min over all-object frames whose second column is numeric-like.

    Two variants are run with identical keys and "col1": one where "col2"
    is all-NaN and one where it holds ``range(4)``.
    """
    # https://github.com/pandas-dev/pandas/issues/39329
    keys = ["A", "A", "B", "B"]
    cases = [
        # simplified case: multiple object columns where one is all-NaN
        # -> gets split as the all-NaN is inferred as float
        ([np.nan] * 4, [np.nan, np.nan]),
        # same but with numbers
        (range(4), [0, 2]),
    ]

    for col2_values, col2_minimum in cases:
        df = DataFrame(
            {"key": keys, "col1": list("abcd"), "col2": col2_values},
        ).astype(object)
        result = df.groupby("key").min()
        expected = DataFrame(
            {"key": ["A", "B"], "col1": ["a", "c"], "col2": col2_minimum}
        ).set_index("key")
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["min", "max"])
def test_aggregate_categorical_lost_index(func: str):
    """agg with min/max on an ordered categorical keeps the group index."""
    # GH: 28641 groupby drops index, when grouping over categorical column with min/max
    ordered_cat = Series(["b"], dtype="category").cat.as_ordered()
    df = DataFrame({"A": [1997], "B": ordered_cat})

    result = df.groupby("A").agg({"B": func})

    expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))
    # ordered categorical dtype should be preserved
    expected["B"] = expected["B"].astype(ordered_cat.dtype)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
def test_groupby_min_max_nullable(dtype):
    """Grouped min/max on nullable dtypes, with and without NA and min_count.

    A single group holding two consecutive values is reduced; the same
    expectations are then reused for a frame with an extra NA row.
    """
    # GH#41743 avoid precision loss (the Int64 case uses a very large integer)
    low = {"Int64": 1618556707013635762, "boolean": 0}.get(dtype, 4.0)
    high = low + 1

    df = DataFrame({"id": [2, 2], "ts": [low, high]})
    df["ts"] = df["ts"].astype(dtype)

    expected_min = df.iloc[:1].set_index("id")
    expected_max = df.iloc[1:].set_index("id")
    expected_na = DataFrame({"ts": [pd.NA]}, index=expected_min.index, dtype=dtype)

    gb = df.groupby("id")
    tm.assert_frame_equal(gb.min(), expected_min)
    tm.assert_frame_equal(gb.max(), expected_max)
    tm.assert_frame_equal(gb.min(min_count=3), expected_na)
    tm.assert_frame_equal(gb.max(min_count=3), expected_na)

    # Case with NA values
    df_with_na = DataFrame({"id": [2, 2, 2], "ts": [low, pd.NA, high]})
    df_with_na["ts"] = df_with_na["ts"].astype(dtype)

    gb_with_na = df_with_na.groupby("id")
    tm.assert_frame_equal(gb_with_na.min(), expected_min)
    tm.assert_frame_equal(gb_with_na.max(), expected_max)
    tm.assert_frame_equal(gb_with_na.min(min_count=100), expected_na)
    tm.assert_frame_equal(gb_with_na.max(min_count=100), expected_na)

View File

@@ -0,0 +1,155 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
date_range,
)
import pandas._testing as tm
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    """The name of the columns Index must survive a grouped ffill/bfill."""
    # GH: 29764 groupby loses index sometimes
    named_columns = Index(["type", "a", "b"], name="idx")
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=named_columns,
    )

    selection = df.groupby(["type"])[["a", "b"]]
    filled = getattr(selection, func)()

    tm.assert_index_equal(filled.columns, Index(["a", "b"], name="idx"))
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    """Grouped ffill/bfill works on a frame with two "field1" columns."""
    # GH: 25610 ValueError with duplicate column names
    left = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    right = DataFrame({"field1": [1, np.nan, 4]})
    combined = pd.concat([left, right], axis=1)

    result = getattr(combined.groupby(by=["field2"]), func)()

    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    tm.assert_frame_equal(result, expected)
def test_ffill_missing_arguments():
    """Grouped fillna() without any argument must raise a ValueError."""
    # GH 14955
    frame = DataFrame({"a": [1, 2], "b": [1, 1]})
    grouped = frame.groupby("b")
    with pytest.raises(ValueError, match="Must specify a fill"):
        grouped.fillna()
@pytest.mark.parametrize(
    "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
    """Grouped fillna(method=...) on a "string" dtype column keeps the dtype."""
    # GH 40250
    strings = pd.array([None, "a", None], dtype="string")
    df = DataFrame({"a": strings, "b": [0, 0, 0]})

    result = df.groupby("b").fillna(method=method)

    expected_frame = DataFrame({"a": pd.array(expected, dtype="string")})
    tm.assert_frame_equal(result, expected_frame)
def test_fill_consistency():
    """Grouped ffill along axis 0 equals the transposed fill along axis 1."""
    # GH9221
    # pass thru keyword arguments to the generated wrapper
    # are set if the passed kw is None (only)
    nan = np.nan
    row_index = pd.MultiIndex.from_product(
        [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
    )
    df = DataFrame(index=row_index, columns=Index(["1", "2"], name="id"))
    df["1"] = [nan, 1, nan, nan, 11, nan, nan, 2, nan, nan, 22, nan]
    df["2"] = [nan, 3, nan, nan, 33, nan, nan, 4, nan, nan, 44, nan]

    expected = df.groupby(level=0, axis=0).fillna(method="ffill")

    transposed = df.T
    result = transposed.groupby(level=0, axis=1).fillna(method="ffill").T
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    """Grouped ffill/bfill with an optional NaN group key and dropna on/off.

    The input is a two-row frame reindexed with -1 placeholders (all-NaN
    rows); the expected output is described as another reindexer over the
    same two rows, looked up per parameter combination.
    """
    # GH 34725
    base = DataFrame([(1, 0.1), (2, 0.2)])

    input_rows = [-1, 0, -1, -1, 1, -1]
    df = base.reindex(input_rows).reset_index(drop=True)

    second_group = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [second_group] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

    # keyed by (method, dropna, has_nan_group)
    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }
    output_rows = expected_rows.get((method, dropna, has_nan_group))
    expected = base.reindex(output_rows).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    """min_count decides whether column "b" (one valid value) yields NaN."""
    # GH#37821
    frame = DataFrame(
        {"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3}
    )

    reducer = getattr(frame.groupby("a"), func)
    result = reducer(min_count=min_count)

    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
def test_indices_with_missing():
    """``.indices`` omits the row whose first grouping key is NaN."""
    # GH 9304
    frame = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    result = frame.groupby(["a", "b"]).indices
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected

View File

@@ -0,0 +1,843 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
isna,
)
import pandas._testing as tm
def test_first_last_nth(df):
    """Compare first/last/nth on the ``df`` fixture grouped by column "A"."""
    # tests for first / last / nth
    value_cols = ["B", "C", "D"]
    grouped = df.groupby("A")

    def rows_as_groups(rows, labels):
        # select rows by label and relabel them with the group keys
        picked = df.loc[rows, value_cols].copy()
        picked.index = Index(labels, name="A")
        return picked

    expected_first = rows_as_groups([1, 0], ["bar", "foo"]).sort_index()
    tm.assert_frame_equal(grouped.first(), expected_first)
    tm.assert_frame_equal(grouped.nth(0), expected_first)

    expected_last = rows_as_groups([5, 7], ["bar", "foo"])
    tm.assert_frame_equal(grouped.last(), expected_last)
    tm.assert_frame_equal(grouped.nth(-1), expected_last)

    expected_second = rows_as_groups([2, 3], ["foo", "bar"]).sort_index()
    tm.assert_frame_equal(grouped.nth(1), expected_second)

    # it works!
    grouped["B"].first()
    grouped["B"].last()
    grouped["B"].nth(0)

    # blank out "B" for the whole "foo" group; every variant must give NA
    df.loc[df["A"] == "foo", "B"] = np.nan
    assert isna(grouped["B"].first()["foo"])
    assert isna(grouped["B"].last()["foo"])
    assert isna(grouped["B"].nth(0)["foo"])

    # v0.14.0 whatsnew
    whatsnew = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = whatsnew.groupby("A")
    expected = whatsnew.iloc[[1, 2]].set_index("A")

    tm.assert_frame_equal(g.first(), expected)
    tm.assert_frame_equal(g.nth(0, dropna="any"), expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    """first/last skip the null placed at the end of the second group."""
    # https://github.com/pandas-dev/pandas/issues/32123
    frame = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = getattr(frame.groupby("a"), method)()

    picked = [1, 3] if method == "first" else [2, 3]
    values = np.array(picked, dtype=result["b"].dtype)
    expected = DataFrame({"b": values}, index=Index([1, 2], name="a"))
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    """nth(-1) returns the null itself, unlike first/last which skip it."""
    # https://github.com/pandas-dev/pandas/issues/32123
    frame = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = frame.groupby("a").nth(index)

    picked = [1, 3] if index == 0 else [2, nulls_fixture]
    values = np.array(picked, dtype=result["b"].dtype)
    expected = DataFrame({"b": values}, index=Index([1, 2], name="a"))
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    """A one-row frame holding None round-trips through first/last."""
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
    reducer = getattr(df.groupby("id", as_index=False), method)
    tm.assert_frame_equal(reducer(), df)
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    """first/last on object columns mixing None, NaN and a real value."""
    # GH 32800, 38286
    grouped = df.groupby("id")
    reducer = getattr(grouped, method)
    tm.assert_frame_equal(reducer(), expected)
def test_first_last_nth_dtypes(df_mixed_floats):
    """first/last/nth on a mixed-dtype frame, plus an int64 Series check."""
    df = df_mixed_floats.copy()
    df["E"] = True
    df["F"] = 1

    value_cols = ["B", "C", "D", "E", "F"]

    def expected_for(rows):
        # rows picked by label, relabelled by group key and sorted
        picked = df.loc[rows, value_cols]
        picked.index = Index(["bar", "foo"], name="A")
        return picked.sort_index()

    # tests for first / last / nth
    grouped = df.groupby("A")
    tm.assert_frame_equal(grouped.first(), expected_for([1, 0]))
    tm.assert_frame_equal(grouped.last(), expected_for([5, 7]))
    tm.assert_frame_equal(grouped.nth(1), expected_for([3, 2]))

    # GH 2763, first/last shifting dtypes
    labels = [*range(10), 9]
    s = Series(data=range(11), index=labels, name="IntCol")
    assert s.dtype == "int64"
    firsts = s.groupby(level=0).first()
    assert firsts.dtype == "int64"
def test_first_last_nth_nan_dtype():
    """An object column holding only NaN comes back unchanged from first/last/nth."""
    # GH 33591
    df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)})
    grouped = df.groupby("data")
    expected = df.set_index("data").nans

    reductions = (
        lambda gb: gb.first(),
        lambda gb: gb.last(),
        lambda gb: gb.nth(-1),
        lambda gb: gb.nth(0),
    )
    for reduce in reductions:
        tm.assert_series_equal(reduce(grouped.nans), expected)
def test_first_strings_timestamps():
    """first() works when column labels mix Timestamps and strings."""
    # GH 11244
    frame = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    result = frame.groupby("name").first()

    expected_columns = Index(
        [Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]
    )
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=expected_columns,
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)
def test_nth():
    """Exercise GroupBy.nth across a series of independent scenarios.

    Covered in order: positive/negative positions on a small frame, the
    ``dropna`` keyword, out-of-bounds positions on a MultiIndex frame
    (GH 6621), agreement with ``first`` on random integer data (GH 7559),
    the documentation example, lists of positions, and picking selected
    business days per month.
    """
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")

    tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A"))
    tm.assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A"))
    tm.assert_frame_equal(g.nth(2), df.loc[[]].set_index("A"))
    tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A"))
    tm.assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A"))
    tm.assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A"))
    tm.assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]])
    tm.assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]])
    tm.assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A"))

    exp = df.set_index("A")
    tm.assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]])
    tm.assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]])

    # positions past the end of every group: expected values are all-NaN
    exp["B"] = np.nan
    tm.assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]])
    tm.assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame(
        {
            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
            "two": {
                0: 1.5456590000000001,
                1: -0.070345000000000005,
                2: -2.4004539999999999,
                3: 0.46206000000000003,
                4: 0.52350799999999997,
            },
            "one": {
                0: 0.56573799999999996,
                1: -0.9742360000000001,
                2: 1.033801,
                3: -0.78543499999999999,
                4: 0.70422799999999997,
            },
        }
    ).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    # Seeded generator: the "validate first" asserts below index the rows
    # whose key equals 1, so the data must be reproducible rather than
    # depend on what the global RNG happens to draw.
    rng = np.random.default_rng(2)
    df = DataFrame(rng.integers(1, 10, (100, 2)), dtype="int64")
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = s[g == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    result = s.groupby(g, sort=False).nth(0, dropna="all")
    tm.assert_series_equal(result, expected)

    with pytest.raises(ValueError, match="For a DataFrame"):
        s.groupby(g, sort=False).nth(0, dropna=True)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.B.nth(0, dropna="all")
    expected = g.B.first()
    tm.assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    g = df.groupby("A")

    tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A"))
    tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    tm.assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A"))
    tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A"))

    business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")
    df = DataFrame(1, index=business_dates, columns=["a", "b"])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    )
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)
def test_nth_multi_index(three_group):
    """nth(0) over two grouping keys must equal first()."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex, should match .first()
    by_a_and_b = three_group.groupby(["A", "B"])
    tm.assert_frame_equal(by_a_and_b.nth(0), by_a_and_b.first())
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """first/last with as_index=False, on the frame and on the "time" column."""
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False
    df = DataFrame(data)
    cols = ["id", "time", "foo"]

    scenarios = (("first", expected_first), ("last", expected_last))
    for method, expected_data in scenarios:
        expected = DataFrame(expected_data)

        frame_result = getattr(df.groupby("id", as_index=False), method)()
        tm.assert_frame_equal(frame_result[cols], expected[cols])

        column_result = getattr(df.groupby("id", as_index=False)["time"], method)()
        tm.assert_frame_equal(column_result, expected[["id", "time"]])
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha):
    """first/last on a frame holding both categorical and tz-aware columns."""
    # GH 21603
    category_string = Series(list("abc")).astype("category")
    frame = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
        }
    )
    result = getattr(frame.groupby("group"), method)()

    expected_categories = pd.Categorical([alpha, "c"], dtype=category_string.dtype)
    expected_times = [ts, Timestamp("2013-01-03", tz="US/Eastern")]
    expected = DataFrame(
        {"category_string": expected_categories, "datetimetz": expected_times},
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    """With one row per group, each reduction must return the input values."""
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    frame = DataFrame({"a": [1, 2], "b": values})
    grouped = frame.groupby("a")
    group_keys = Index([1, 2], name="a")

    # Series path: call the reduction directly on the selected column
    series_result = getattr(grouped["b"], function)()
    tm.assert_series_equal(
        series_result, Series(values, name="b", index=group_keys)
    )

    # DataFrame path: go through agg with a column -> function mapping
    frame_result = grouped.agg({"b": function})
    tm.assert_frame_equal(frame_result, DataFrame({"b": values}, index=group_keys))
def test_nth_multi_index_as_expected():
    """nth(0) over keys "A" and "B" against an explicit expected frame."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    three_group = DataFrame({"A": col_a, "B": col_b, "C": col_c})

    result = three_group.groupby(["A", "B"]).nth(0)

    expected_index = MultiIndex.from_arrays(
        [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]],
        names=["A", "B"],
    )
    expected = DataFrame({"C": ["dull"] * 4}, index=expected_index)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    """head/tail return the expected original rows, with optional column selection."""
    frame = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    grouped = frame.groupby("A", as_index=as_index)
    expected = frame.iloc[expected_rows]

    if columns is not None:
        # narrow both sides to the same column subset
        grouped = grouped[columns]
        expected = expected[columns]

    result = getattr(grouped, op)(n)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "op, n, expected_cols",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
    """head/tail on a column-wise groupby select the expected columns."""
    # GH 9772
    frame = DataFrame(
        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
    )
    by_columns = frame.groupby([0, 0, 1], axis=1)

    result = getattr(by_columns, op)(n)

    tm.assert_frame_equal(result, frame.iloc[:, expected_cols])
def test_group_selection_cache():
    """Check that nth, head and tail give stable results in any call order.

    A fresh groupby object is created for each of the four orderings
    (head->nth, tail->nth, nth->head, nth->tail); in every ordering
    ``head(n=2)``/``tail(n=2)`` must return the whole frame and ``nth(0)``
    must return rows 0 and 2 indexed by "A".
    """
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]].set_index("A")

    # head first, then nth on the same groupby object
    g = df.groupby("A")
    result1 = g.head(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    # tail first, then nth
    g = df.groupby("A")
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    # nth first, then head
    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.head(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)

    # nth first, then tail
    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)
def test_nth_empty():
    """nth(10) on a one-row all-missing frame gives an empty, labelled result."""
    # GH 16064
    df = DataFrame(index=[0], columns=["a", "b", "c"])

    # single grouping key
    single_key = df.groupby("a").nth(10)
    tm.assert_frame_equal(
        single_key, DataFrame(index=Index([], name="a"), columns=["b", "c"])
    )

    # two grouping keys -> empty MultiIndex
    two_keys = df.groupby(["a", "b"]).nth(10)
    empty_index = MultiIndex([[], []], [[], []], names=["a", "b"])
    tm.assert_frame_equal(two_keys, DataFrame(index=empty_index, columns=["c"]))
def test_nth_column_order():
    """nth keeps the frame's column order ("C" before "B")."""
    # GH 20760
    # Check that nth preserves column order
    df = DataFrame(
        [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
        columns=["A", "C", "B"],
    )
    grouped = df.groupby("A")
    group_keys = Index([1, 2], name="A")

    first_rows = grouped.nth(0)
    tm.assert_frame_equal(
        first_rows,
        DataFrame([["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=group_keys),
    )

    last_complete_rows = df.groupby("A").nth(-1, dropna="any")
    tm.assert_frame_equal(
        last_complete_rows,
        DataFrame([["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=group_keys),
    )
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """Rows whose grouping key is NaN are absent from the nth(0) result."""
    # GH 26011
    rows = [
        [np.nan, 0, 1],
        ["abc", 2, 3],
        [np.nan, 4, 5],
        ["def", 6, 7],
        [np.nan, 8, 9],
    ]
    df = DataFrame(rows, columns=list("abc"))

    result = df.groupby("a").nth(0, dropna=dropna)

    expected = DataFrame(
        [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a")
    )
    tm.assert_frame_equal(result, expected)
def test_first_categorical_and_datetime_data_nat():
    """first() with an all-NaT datetime column next to a categorical column."""
    # GH 20520
    nat = np.datetime64("NaT")
    df = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": 5 * [nat],
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )
    result = df.groupby("group").first()

    expected_categories = Series(["a", "c", "a"]).astype(
        pd.CategoricalDtype(["a", "b", "c"])
    )
    expected = DataFrame({"time": 3 * [nat], "categories": expected_categories})
    expected.index = Index(["first", "second", "third"], name="group")
    tm.assert_frame_equal(result, expected)
def test_first_multi_key_groupbby_categorical():
    """first() over two keys keeps the categorical dtype of column "D"."""
    # GH 22512
    df = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = df.astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()

    planet_dtype = pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(planet_dtype),
        }
    )
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    """The single real value "y" among nulls is found by first/last/nth(3)."""
    # GH29645
    expected = Series(["y"])
    values = [nulls_fixture] * 3 + ["y", nulls_fixture]
    grouped = Series(values, index=[0, 0, 0, 0, 0]).groupby(level=0)

    # nth needs the position of "y"; first/last take no argument
    args = (3,) if method == "nth" else ()
    result = getattr(grouped, method)(*args)

    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    """nth accepts slices (and lists of them) via both indexing and calling."""
    # Test slices     GH #42947
    via_getitem = slice_test_grouped.nth[arg]
    via_call = slice_test_grouped.nth(arg)
    expected = slice_test_df.iloc[expected_rows]

    for result in (via_getitem, via_call):
        tm.assert_frame_equal(result, expected)
def test_nth_indexed(slice_test_df, slice_test_grouped):
    """Tuple-style indexing on nth matches the equivalent list call."""
    # Test index notation     GH #44688
    via_getitem = slice_test_grouped.nth[0, 1, -2:]
    via_call = slice_test_grouped.nth([0, 1, slice(-2, None)])
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    for result in (via_getitem, via_call):
        tm.assert_frame_equal(result, expected)
def test_invalid_argument(slice_test_grouped):
    """A float position passed to nth must raise TypeError."""
    # Test for error on invalid argument
    nth = slice_test_grouped.nth
    with pytest.raises(TypeError, match="Invalid index"):
        nth(3.14)
def test_negative_step(slice_test_grouped):
    """A slice with step -1 passed to nth must raise ValueError."""
    # Test for error on negative slice step
    reversed_slice = slice(None, None, -1)
    nth = slice_test_grouped.nth
    with pytest.raises(ValueError, match="Invalid step"):
        nth(reversed_slice)
def test_np_ints(slice_test_df, slice_test_grouped):
    """nth accepts a NumPy integer array of positions."""
    # Test np ints work
    positions = np.array([0, 1])
    result = slice_test_grouped.nth(positions)
    tm.assert_frame_equal(result, slice_test_df.iloc[[0, 1, 2, 3, 4]])
def test_groupby_nth_with_column_axis():
    """nth(0) on a column-wise groupby keyed by the values of row "y"."""
    # GH43926
    frame = DataFrame(
        [[4, 5, 6], [8, 8, 7]],
        index=["z", "y"],
        columns=["C", "B", "A"],
    )
    column_keys = frame.iloc[1]
    result = frame.groupby(column_keys, axis=1).nth(0)

    expected = DataFrame(
        [[6, 4], [7, 8]],
        index=["z", "y"],
        columns=[7, 8],
    )
    expected.columns.name = "y"
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "start, stop, expected_values, expected_columns",
    [
        (None, None, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]),
        (None, 1, [0, 3], [5, 6]),
        (None, 9, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]),
        (None, -1, [0, 1, 3], [5, 5, 6]),
        (1, None, [1, 2, 4], [5, 5, 6]),
        (1, -1, [1], [5]),
        (-1, None, [2, 4], [5, 6]),
        (-1, 2, [4], [6]),
    ],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
    start, stop, expected_values, expected_columns, method
):
    """Slice-based nth on a column-wise groupby, by call and by indexing."""
    frame = DataFrame([range(5)], columns=[list("ABCDE")])
    gb = frame.groupby([5, 5, 5, 6, 6], axis=1)

    if method == "call":
        result = gb.nth(slice(start, stop))
    elif method == "index":
        result = gb.nth[start:stop]
    else:
        # mirror a failed dispatch-table lookup for an unknown method
        raise KeyError(method)

    expected = DataFrame([expected_values], columns=expected_columns)
    tm.assert_frame_equal(result, expected)
def test_head_tail_dropna_true():
    """With the default dropna, rows with a NaN key are left out of head/tail/nth."""
    # GH#45089
    df = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    def regroup():
        # a fresh groupby for every check
        return df.groupby(["X", "Y"])

    tm.assert_frame_equal(regroup().head(n=1), expected)
    tm.assert_frame_equal(regroup().tail(n=1), expected)
    tm.assert_frame_equal(regroup().nth(n=0).reset_index(), expected)
def test_head_tail_dropna_false():
# GH#45089
df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
result = df.groupby(["X", "Y"], dropna=False).head(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).nth(n=0).reset_index()
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,73 @@
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
def test_cython_vs_numba_frame(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
# check_dtype can be removed if GH 44952 is addressed
check_dtype = func != "sum"
tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
def test_cython_vs_numba_getitem(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)["c"]
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
# check_dtype can be removed if GH 44952 is addressed
check_dtype = func != "sum"
tm.assert_series_equal(result, expected, check_dtype=check_dtype)
def test_cython_vs_numba_series(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
ser = Series(range(3), index=[1, 2, 1], name="foo")
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = ser.groupby(level=0, sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
# check_dtype can be removed if GH 44952 is addressed
check_dtype = func != "sum"
tm.assert_series_equal(result, expected, check_dtype=check_dtype)
def test_as_index_false_unsupported(self, numba_supported_reductions):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a", as_index=False)
with pytest.raises(NotImplementedError, match="as_index=False"):
getattr(gb, func)(engine="numba", **kwargs)
def test_axis_1_unsupported(self, numba_supported_reductions):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a", axis=1)
with pytest.raises(NotImplementedError, match="axis=1"):
getattr(gb, func)(engine="numba", **kwargs)

View File

@@ -0,0 +1,184 @@
import datetime as dt
from string import ascii_lowercase
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
@pytest.mark.slow
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
@pytest.mark.parametrize("m", [10, 100, 1000])
@pytest.mark.parametrize("sort", [False, True])
@pytest.mark.parametrize("dropna", [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
def check_nunique(df, keys, as_index=True):
original_df = df.copy()
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr["julie"].nunique(dropna=dropna)
gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr["julie"].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)
if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)
days = date_range("2015-08-23", periods=10)
frame = DataFrame(
{
"jim": np.random.choice(list(ascii_lowercase), n),
"joe": np.random.choice(days, n),
"julie": np.random.randint(0, m, n),
}
)
check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])
frame.loc[1::17, "jim"] = None
frame.loc[3::37, "joe"] = None
frame.loc[7::19, "julie"] = None
frame.loc[8::19, "julie"] = None
frame.loc[9::19, "julie"] = None
check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])
check_nunique(frame, ["jim"], as_index=False)
check_nunique(frame, ["jim", "joe"], as_index=False)
def test_nunique():
    """``DataFrameGroupBy.nunique`` with and without ``as_index`` and ``dropna``."""
    df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})

    # as_index=False: the grouping key is returned as a regular column
    flat_expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
    result = df.groupby("A", as_index=False).nunique()
    tm.assert_frame_equal(result, flat_expected)

    # as_index
    expected = flat_expected.set_index("A")
    result = df.groupby("A").nunique()
    tm.assert_frame_equal(result, expected)

    # with na
    result = df.replace({"x": None}).groupby("A").nunique(dropna=False)
    tm.assert_frame_equal(result, expected)

    # dropna
    expected = DataFrame(
        {"B": [1] * 3, "C": [1] * 3}, index=list("abc")
    ).rename_axis("A")
    result = df.replace({"x": None}).groupby("A").nunique()
    tm.assert_frame_equal(result, expected)
def test_nunique_with_object():
    """``nunique`` of an object column grouped by two keys gives 1 per group."""
    # GH 11077
    records = [
        [100, 1, "Alice"],
        [200, 2, "Bob"],
        [300, 3, "Charlie"],
        [-400, 4, "Dan"],
        [500, 5, "Edith"],
    ]
    data = DataFrame(records, columns=["amount", "id", "name"])

    result = data.groupby(["id", "amount"])["name"].nunique()

    expected_index = MultiIndex.from_arrays([data.id, data.amount])
    expected = Series([1] * 5, name="name", index=expected_index)
    tm.assert_series_equal(result, expected)
def test_nunique_with_empty_series():
    """``nunique`` on an empty object Series yields an empty int64 Series."""
    # GH 12553
    empty = Series(name="name", dtype=object)
    expected = Series(name="name", dtype="int64")
    result = empty.groupby(level=0).nunique()
    tm.assert_series_equal(result, expected)
def test_nunique_with_timegrouper():
    """``nunique`` under an hourly ``Grouper`` equals ``apply(Series.nunique)``."""
    # GH 13453
    stamps = [
        Timestamp("2016-06-28 09:35:35"),
        Timestamp("2016-06-28 16:09:30"),
        Timestamp("2016-06-28 16:46:28"),
    ]
    test = DataFrame({"time": stamps, "data": ["1", "2", "3"]}).set_index("time")

    def hourly_data():
        # Build a fresh groupby for every use, as two independent calls would.
        return test.groupby(pd.Grouper(freq="h"))["data"]

    result = hourly_data().nunique()
    expected = hourly_data().apply(Series.nunique)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"key, data, dropna, expected",
[
(
["x", "x", "x"],
[Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")],
True,
Series([1], index=pd.Index(["x"], name="key"), name="data"),
),
(
["x", "x", "x"],
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
True,
Series([1], index=pd.Index(["x"], name="key"), name="data"),
),
(
["x", "x", "x", "y", "y"],
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
False,
Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
),
(
["x", "x", "x", "x", "y"],
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
False,
Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
),
],
)
def test_nunique_with_NaT(key, data, dropna, expected):
# GH 27951
df = DataFrame({"key": key, "data": data})
result = df.groupby(["key"])["data"].nunique(dropna=dropna)
tm.assert_series_equal(result, expected)
def test_nunique_preserves_column_level_names():
    """The name of the columns Index survives ``groupby(...).nunique()``."""
    # GH 23222
    named_columns = pd.Index(["A"], name="level_0")
    test = DataFrame([1, 2, 2], columns=named_columns)
    expected = DataFrame([2], columns=test.columns)
    result = test.groupby([0, 0, 0]).nunique()
    tm.assert_frame_equal(result, expected)
def test_nunique_transform_with_datetime():
    """``transform("nunique")`` on a datetime column returns integer counts."""
    # GH 35109 - transform with nunique on datetimes results in integers
    dates = date_range("2008-12-31", "2009-01-02")
    df = DataFrame(dates, columns=["date"])
    expected = Series([2, 2, 1], name="date")
    result = df.groupby([0, 0, 1])["date"].transform("nunique")
    tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,82 @@
import numpy as np
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
from pandas.core.api import Int64Index
def test_pipe():
    """Chain ``DataFrameGroupBy.pipe`` into ``Series.pipe``."""
    # Test the pipe method of DataFrameGroupBy.
    # Issue #17871
    rng = np.random.RandomState(1234567890)

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": rng.randn(8),
            "C": rng.randn(8),
        }
    )

    def max_b_minus_lowest_c(dfgb):
        # Per-group max of B, shifted by the smallest per-group min of C.
        lowest_c = dfgb.C.min().min()
        return dfgb.B.max() - lowest_c

    def squared(srs):
        return srs**2

    # Note that the transformations are
    # GroupBy -> Series
    # Series -> Series
    # This then chains the GroupBy.pipe and the
    # NDFrame.pipe methods
    per_group = df.groupby("A").pipe(max_b_minus_lowest_c)
    result = per_group.pipe(squared)

    expected = pd.Series(
        [8.99110003361, 8.17516964785],
        name="B",
        index=Index(["bar", "foo"], dtype="object", name="A"),
    )

    tm.assert_series_equal(expected, result)
def test_pipe_args():
# Test passing args to the pipe method of DataFrameGroupBy.
# Issue #17871
df = DataFrame(
{
"group": ["A", "A", "B", "B", "C"],
"x": [1.0, 2.0, 3.0, 2.0, 5.0],
"y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
}
)
def f(dfgb, arg1):
return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(
dfgb.grouper
)
def g(dfgb, arg2):
return dfgb.sum() / dfgb.sum().sum() + arg2
def h(df, arg3):
return df.x + df.y - arg3
result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
# Assert the results here
index = Index(["A", "B", "C"], name="group")
expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index)
tm.assert_series_equal(expected, result)
# test SeriesGroupby.pipe
ser = pd.Series([1, 1, 2, 2, 3, 3])
result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
expected = pd.Series([4, 8, 12], index=Int64Index([1, 2, 3]))
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,331 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
"a_vals,b_vals",
[
# Ints
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
([1, 2, 3, 4], [4, 3, 2, 1]),
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
# Floats
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
# Missing data
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
# Timestamps
(
list(pd.date_range("1/1/18", freq="D", periods=5)),
list(pd.date_range("1/1/18", freq="D", periods=5))[::-1],
),
# All NA
([np.nan] * 5, [np.nan] * 5),
],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q):
if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]:
pytest.skip(
"Unclear numpy expectation for nearest result with equidistant data"
)
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
df = DataFrame(
{"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals}
)
expected = DataFrame(
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
)
result = df.groupby("key").quantile(q, interpolation=interpolation)
tm.assert_frame_equal(result, expected)
def test_quantile_array():
    """A list-like ``q`` adds the quantiles as an extra index level."""
    # https://github.com/pandas-dev/pandas/issues/27526
    single_col = DataFrame({"A": [0, 1, 2, 3, 4]})
    result = single_col.groupby([0, 0, 1, 1, 1]).quantile([0.25])

    expected = DataFrame(
        {"A": [0.25, 2.50]},
        index=pd.MultiIndex.from_product([[0, 1], [0.25]]),
    )
    tm.assert_frame_equal(result, expected)

    two_cols = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    expected_index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

    result = two_cols.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
def test_quantile_array2():
    """List-like ``q`` on seeded random integer data, grouped by column ``A``."""
    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
    values = np.random.RandomState(0).randint(0, 5, size=(10, 3))
    df = DataFrame(values, columns=list("ABC"))

    result = df.groupby("A").quantile([0.3, 0.7])

    expected_index = pd.MultiIndex.from_product(
        [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
    )
    expected = DataFrame(
        {
            "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0],
            "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0],
        },
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
def test_quantile_array_no_sort():
    """With ``sort=False`` groups and quantiles keep the order they were given."""
    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    group_keys = [1, 0, 1]

    # Quantiles listed in ascending order.
    ascending_q = [0.25, 0.5, 0.75]
    result = df.groupby(group_keys, sort=False).quantile(ascending_q)
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], ascending_q]),
    )
    tm.assert_frame_equal(result, expected)

    # Quantiles listed in descending order.
    descending_q = [0.75, 0.25]
    result = df.groupby(group_keys, sort=False).quantile(descending_q)
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], descending_q]),
    )
    tm.assert_frame_equal(result, expected)
def test_quantile_array_multiple_levels():
    """List-like ``q`` with two grouping keys produces a three-level index."""
    df = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])

    index_tuples = [
        ("a", "a", 0.25),
        ("a", "a", 0.75),
        ("a", "b", 0.25),
        ("a", "b", 0.75),
    ]
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]},
        index=pd.MultiIndex.from_tuples(index_tuples, names=["c", "d", None]),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
# GH30289
nrow, ncol = frame_size
df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q]
idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
list(range(len(q))) * min(nrow, 4)
]
expected_index = pd.MultiIndex(
levels=idx_levels, codes=idx_codes, names=groupby + [None]
)
expected_values = [
[float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
]
expected_columns = [x for x in range(ncol) if x not in groupby]
expected = DataFrame(
expected_values, index=expected_index, columns=expected_columns
)
result = df.groupby(groupby).quantile(q)
tm.assert_frame_equal(result, expected)
def test_quantile_raises():
    """Quantile over an object column raises ``TypeError`` after a warning."""
    records = [["foo", "a"], ["foo", "b"], ["foo", "c"]]
    df = DataFrame(records, columns=["key", "val"])

    expect_error = pytest.raises(
        TypeError, match="cannot be performed against 'object' dtypes"
    )
    expect_warning = tm.assert_produces_warning(
        FutureWarning, match="Dropping invalid columns"
    )
    # The error check is the outer context, the warning check the inner one.
    with expect_error, expect_warning:
        df.groupby("key").quantile()
def test_quantile_out_of_bounds_q_raises():
    """A ``q`` of 50 or -1 is rejected with ``ValueError``."""
    # https://github.com/pandas-dev/pandas/issues/27470
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    g = df.groupby([0, 0, 0, 1, 1, 1])

    rejected = (
        (50, "Got '50.0' instead"),
        (-1, "Got '-1.0' instead"),
    )
    for bad_q, message in rejected:
        with pytest.raises(ValueError, match=message):
            g.quantile(bad_q)
def test_quantile_missing_group_values_no_segfaults():
    """Repeated quantile calls with a NaN group key must not crash."""
    # GH 28662
    keys = np.array([1.0, np.nan, 1.0])
    df = DataFrame({"key": keys, "val": range(3)})

    # Random segfaults; would have been guaranteed in loop
    grp = df.groupby("key")
    attempts = 100
    for _ in range(attempts):
        grp.quantile()
@pytest.mark.parametrize(
"key, val, expected_key, expected_val",
[
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
([0], [42], [0], [42.0]),
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
],
)
def test_quantile_missing_group_values_correct_results(
key, val, expected_key, expected_val
):
# GH 28662, GH 33200, GH 33569
df = DataFrame({"key": key, "val": val})
expected = DataFrame(
expected_val, index=Index(expected_key, name="key"), columns=["val"]
)
grp = df.groupby("key")
result = grp.quantile(0.5)
tm.assert_frame_equal(result, expected)
result = grp.quantile()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
pd.array([1, 0, None] * 2, dtype="Int64"),
pd.array([True, False, None] * 2, dtype="boolean"),
],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
# https://github.com/pandas-dev/pandas/issues/33136
df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
result = df.groupby("a")["b"].quantile(q)
if isinstance(q, list):
idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
true_quantiles = [0.0, 0.5, 1.0]
else:
idx = Index(["x", "y"], name="a")
true_quantiles = [0.5]
expected = pd.Series(true_quantiles * 2, index=idx, name="b")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_skips_invalid_dtype(q):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = df.groupby("a").quantile(q)
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
def test_groupby_quantile_NA_float(any_float_dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
result = df.groupby("x")["y"].quantile(0.5)
exp_index = Index([1.0], dtype=any_float_dtype, name="x")
expected = pd.Series([0.2], dtype=float, index=exp_index, name="y")
tm.assert_series_equal(expected, result)
result = df.groupby("x")["y"].quantile([0.5, 0.75])
expected = pd.Series(
[0.2] * 2,
index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
name="y",
)
tm.assert_series_equal(result, expected)
def test_groupby_quantile_NA_int(any_int_ea_dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[3.5], dtype=float, index=Index([1], name="x", dtype=any_int_ea_dtype), name="y"
)
tm.assert_series_equal(expected, result)
result = df.groupby("x").quantile(0.5)
expected = DataFrame({"y": 3.5}, index=Index([1], name="x", dtype=any_int_ea_dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[np.nan], dtype=float, index=Index([1.0], dtype=dtype), name="y"
)
expected.index.name = "x"
tm.assert_series_equal(expected, result)
def test_groupby_timedelta_quantile():
    """Groupby quantile of a timedelta column returns timedeltas."""
    # GH: 29485
    seconds = pd.to_timedelta(np.arange(4), unit="s")
    df = DataFrame({"value": seconds, "group": [1, 1, 2, 2]})
    result = df.groupby("group").quantile(0.99)

    expected_values = [
        pd.Timedelta(text)
        for text in ("0 days 00:00:00.990000", "0 days 00:00:02.990000")
    ]
    expected = DataFrame(
        {"value": expected_values},
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)
def test_columns_groupby_quantile():
    """List-like ``q`` on an ``axis=1`` groupby adds a quantile column level."""
    # GH 33795
    row_labels = list("XYZ")
    df = DataFrame(
        np.arange(12).reshape(3, -1),
        index=row_labels,
        columns=pd.Series(list("ABAB"), name="col"),
    )
    result = df.groupby("col", axis=1).quantile(q=[0.8, 0.2])

    expected_columns = pd.MultiIndex.from_tuples(
        [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
    )
    expected = DataFrame(
        [
            [1.6, 0.4, 2.6, 1.4],
            [5.6, 4.4, 6.6, 5.4],
            [9.6, 8.4, 10.6, 9.4],
        ],
        index=row_labels,
        columns=expected_columns,
    )

    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,663 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
NaT,
Series,
concat,
)
import pandas._testing as tm
def test_rank_apply():
    """Groupby ``rank`` equals ranking each group separately and concatenating."""
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame(
        {
            "value": np.random.randn(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )
    keys = ["key1", "key2"]

    # First plain ranks, then percentage ranks.
    for rank_kwargs in ({}, {"pct": True}):
        result = df.groupby(keys).value.rank(**rank_kwargs)

        pieces = [piece.value.rank(**rank_kwargs) for _, piece in df.groupby(keys)]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, 8, 2, 6], dtype=dtype)
for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,pct,exp",
[
("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,exp",
[
("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
# GH 20561
key = np.repeat(grps, len(vals))
vals = vals * len(grps)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
for dtype in ["f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
np.nan,
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
np.nan,
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
np.nan,
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
np.nan,
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
np.nan,
np.nan,
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,pct,exp",
[
(
"average",
True,
"keep",
False,
[2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
),
(
"average",
True,
"keep",
True,
[0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
),
(
"average",
False,
"keep",
False,
[4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
),
(
"average",
False,
"keep",
True,
[0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
),
("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
(
"min",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
(
"max",
False,
"keep",
False,
[5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
(
"first",
True,
"keep",
False,
[1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
),
(
"first",
True,
"keep",
True,
[0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
),
(
"first",
False,
"keep",
False,
[3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
(
"first",
False,
"keep",
True,
[0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
),
(
"dense",
True,
"keep",
False,
[1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
),
(
"dense",
True,
"keep",
True,
[
1.0 / 3.0,
1.0 / 3.0,
np.nan,
3.0 / 3.0,
1.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
(
"dense",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
(
"dense",
False,
"keep",
True,
[
3.0 / 3.0,
3.0 / 3.0,
np.nan,
1.0 / 3.0,
3.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
(
"average",
True,
"bottom",
True,
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
),
("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
(
"average",
False,
"bottom",
True,
[0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
),
("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
(
"min",
True,
"bottom",
True,
[0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
),
("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
(
"min",
False,
"bottom",
True,
[0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
),
("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
(
"max",
False,
"bottom",
True,
[0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
),
("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
(
"first",
True,
"bottom",
True,
[0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
),
("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
(
"first",
False,
"bottom",
True,
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
),
("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
df = DataFrame(
{"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
)
result = df.groupby("key").rank(pct=pct)
exp_df = DataFrame(exp * 2, columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
if upper:
# use IntegerDtype/FloatingDtype
dtype = dtype[0].upper() + dtype[1:]
dtype = dtype.replace("Ui", "UI")
df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
df["val"] = df["val"].astype(dtype)
assert df["val"].dtype == dtype
result = df.groupby("key").rank()
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    """Ranking object values agrees with ranking same-ordered numbers."""
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }
    keys = ["foo"] * 5

    obj_frame = DataFrame({"key": keys, "val": vals})
    has_missing = obj_frame["val"].isna().any()
    res = obj_frame.groupby("key").rank(**rank_kwargs)

    # construct our expected by using numeric values with the same ordering
    if has_missing:
        numeric_vals = [0, np.nan, 2, np.nan, 1]
    else:
        numeric_vals = [0, 0, 2, 0, 1]
    alt = DataFrame({"key": keys, "val": numeric_vals}).groupby("key").rank(
        **rank_kwargs
    )

    tm.assert_frame_equal(res, alt)
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """An unsupported ``na_option`` makes groupby rank raise ValueError."""
    frame = DataFrame({"key": ["foo"] * 5, "val": vals})
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"
    with pytest.raises(ValueError, match=msg):
        frame.groupby("key").rank(**rank_kwargs)
def test_rank_empty_group():
    """Percentile rank yields NaN for the group whose only value is missing."""
    # see gh-22519
    group_col = "A"
    frame = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    pct_ranks = [0.5, np.nan, 1.0]

    # selecting the column first gives a Series
    from_column = frame.groupby(group_col).B.rank(pct=True)
    tm.assert_series_equal(from_column, Series(pct_ranks, name="B"))

    # ranking the whole groupby gives a frame holding only the value column
    from_frame = frame.groupby(group_col).rank(pct=True)
    tm.assert_frame_equal(from_frame, DataFrame({"B": pct_ranks}))
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Dense percentile ranks match the expected values, including NaN inputs."""
    # GH 23666
    frame = DataFrame({"A": input_key, "B": input_value})
    ranked = frame.groupby("A").rank(method="dense", pct=True)
    tm.assert_frame_equal(ranked, DataFrame({"B": output_value}))
def test_rank_min_int():
    """The smallest int64 is ranked as a regular value; NaT ranks as NaN."""
    # GH-32859
    int64_info = np.iinfo(np.int64)
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            # int64 min appears in both groups and must still receive rank 1.0
            "int_col": [int64_info.min, int64_info.max, int64_info.min],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()

    # ``np.nan`` is used rather than the ``np.NaN`` alias, which NumPy 2.0 removed
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    """Dense pct rank when the same value (or NaN) appears in two groups."""
    # GH#40518
    fill_value = np.nan if use_nan else 3
    rows = [[-1, 1], [-1, 2], [1, fill_value], [-1, fill_value]]
    frame = DataFrame(rows, columns=["group", "val"])

    ranked = frame.groupby(["group"])["val"].rank(method="dense", pct=True)

    if use_nan:
        expected_vals = [0.5, 1, np.nan, np.nan]
    else:
        expected_vals = [1 / 3, 2 / 3, 1, 1]
    tm.assert_series_equal(ranked, Series(expected_vals, name="val"))
def test_rank_multiindex():
    """Column-wise groupby rank equals ranking each top-level block by row."""
    # GH27721
    left = DataFrame({"col1": [3, 4], "col2": [1, 2]})
    right = DataFrame({"col3": [5, 6], "col4": [7, 8]})
    wide = concat({"a": left, "b": right}, axis=1)

    grouped = wide.groupby(level=0, axis=1)
    ranked = grouped.rank(axis=1)

    # rank every top-level block on its own, then glue them back together
    per_block = [wide[label].rank(axis=1) for label in ("a", "b")]
    expected = concat(per_block, axis=1, keys=["a", "b"])
    tm.assert_frame_equal(ranked, expected)
def test_groupby_axis0_rank_axis1():
    """Row-grouped rank(axis=1) equals ranking each group's rows separately."""
    # GH#41320
    frame = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    grouped = frame.groupby(level=0, axis=0)
    row_ranked = grouped.rank(axis=1)

    # This should match what we get when "manually" operating group-by-group
    pieces = [frame.loc[label].rank(axis=1) for label in ("a", "b")]
    expected = concat(pieces, axis=0)
    tm.assert_frame_equal(row_ranked, expected)

    # check that we haven't accidentally written a case that coincidentally
    # matches rank(axis=0)
    col_ranked = grouped.rank(axis=0)
    assert not col_ranked.equals(expected)
def test_groupby_axis0_cummax_axis1():
    """cummax(axis=1) on a row-wise groupby of a mixed-dtype frame."""
    # case where groupby axis is 0 and axis keyword in transform is 1
    # df has mixed dtype -> multiple blocks
    frame = DataFrame(
        {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
        index=["a", "a", "b", "b"],
    )
    running_max = frame.groupby(level=0, axis=0).cummax(axis=1)

    # expected: columns 0 and 1 as float, and column 2 equal to column 1
    expected = frame[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(running_max, expected)
def test_non_unique_index():
# GH 16577
df = DataFrame(
{"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
)
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
expected = Series(
[1.0] * 4, index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, name="value"
)
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,144 @@
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    """Sampling two rows (n=2 or frac=0.2) from each of two ten-row groups."""
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})
    # two rows survive per group, whichever way the size was requested
    sampled_vals = [1] * 2 + [2] * 2

    frame_sample = frame.groupby("a").sample(n=n, frac=frac)
    expected_frame = DataFrame(
        {"a": sampled_vals, "b": sampled_vals}, index=frame_sample.index
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(n=n, frac=frac)
    expected_series = Series(sampled_vals, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
def test_groupby_sample_unbalanced_groups_shape():
    """``n=5`` draws five rows from each group even when group sizes differ."""
    population = [1] * 10 + [2] * 20
    frame = DataFrame({"a": population, "b": population})
    sampled_vals = [1] * 5 + [2] * 5

    frame_sample = frame.groupby("a").sample(n=5)
    expected_frame = DataFrame(
        {"a": sampled_vals, "b": sampled_vals}, index=frame_sample.index
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(n=5)
    expected_series = Series(sampled_vals, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
def test_groupby_sample_index_value_spans_groups():
    """Sampling works when one index label occurs in more than one group."""
    population = [1] * 3 + [2] * 3
    # label 2 is shared between the rows of group 1 and group 2
    frame = DataFrame({"a": population, "b": population}, index=[1, 2, 2, 2, 2, 2])
    sampled_vals = [1] * 2 + [2] * 2

    frame_sample = frame.groupby("a").sample(n=2)
    expected_frame = DataFrame(
        {"a": sampled_vals, "b": sampled_vals}, index=frame_sample.index
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(n=2)
    expected_series = Series(sampled_vals, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
def test_groupby_sample_n_and_frac_raises():
    """Passing both ``n`` and ``frac`` raises for frame and series groupbys."""
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    # DataFrameGroupBy
    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(n=1, frac=1.0)

    # SeriesGroupBy
    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(n=1, frac=1.0)
def test_groupby_sample_frac_gt_one_without_replacement_raises():
    """``frac`` above 1 with ``replace=False`` raises ValueError."""
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    # DataFrameGroupBy
    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(frac=1.5, replace=False)

    # SeriesGroupBy
    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(frac=1.5, replace=False)
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    """Negative or non-integer ``n`` raises ValueError with a specific message."""
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})

    # the message depends on which kind of invalid value was passed
    msg = (
        "A negative number of rows requested. Please provide `n` >= 0."
        if n < 0
        else "Only integers accepted as `n` values"
    )

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(n=n)

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(n=n)
def test_groupby_sample_oversample():
    """``frac=2.0`` with replacement doubles the row count of every group."""
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})
    sampled_vals = [1] * 20 + [2] * 20

    frame_sample = frame.groupby("a").sample(frac=2.0, replace=True)
    expected_frame = DataFrame(
        {"a": sampled_vals, "b": sampled_vals}, index=frame_sample.index
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(frac=2.0, replace=True)
    expected_series = Series(sampled_vals, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
def test_groupby_sample_without_n_or_frac():
    """With neither ``n`` nor ``frac`` a single row per group is returned."""
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})

    frame_sample = frame.groupby("a").sample(n=None, frac=None)
    expected_frame = DataFrame({"a": [1, 2], "b": [1, 2]}, index=frame_sample.index)
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(n=None, frac=None)
    expected_series = Series([1, 2], name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    """Zero-weight rows are never drawn, for string and integer indexes."""
    # GH 39927 - tests for integer index needed
    vals = [1] * 2 + [2] * 2
    weights = [1, 0, 1, 0]
    frame = DataFrame({"a": vals, "b": vals}, index=Index(index))

    frame_sample = frame.groupby("a").sample(n=2, replace=True, weights=weights)
    expected_frame = DataFrame({"a": vals, "b": vals}, index=Index(expected_index))
    tm.assert_frame_equal(frame_sample, expected_frame)

    series_sample = frame.groupby("a")["b"].sample(
        n=2, replace=True, weights=weights
    )
    expected_series = Series(vals, name="b", index=Index(expected_index))
    tm.assert_series_equal(series_sample, expected_series)
def test_groupby_sample_with_selections():
    """Sampling after selecting a list of columns returns only those columns."""
    # GH 39928
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population, "c": population})

    selected = frame.groupby("a")[["b", "c"]]
    sampled = selected.sample(n=None, frac=None)

    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=sampled.index)
    tm.assert_frame_equal(sampled, expected)

View File

@@ -0,0 +1,67 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
PeriodIndex,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    """``size()`` reports, for each key, the length of that group."""
    groupby_obj = df.groupby(by=by)
    sizes = groupby_obj.size()
    for group_key, members in groupby_obj:
        assert sizes[group_key] == len(members)
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(df, sort, by):
    """``size()`` equals counting rows via ``apply`` for both sort settings."""
    # the fixture argument is replaced by a freshly generated random frame
    df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC"))

    via_size = df.groupby(by=by, sort=sort).size()
    via_apply = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])

    tm.assert_series_equal(via_size, via_apply, check_names=False)
def test_size_series_dataframe():
    """``size()`` on a frame without rows gives an empty int64 Series."""
    # https://github.com/pandas-dev/pandas/issues/11699
    empty_frame = DataFrame(columns=["A", "B"])
    expected = Series(dtype="int64", index=Index([], name="A"))

    result = empty_frame.groupby("A").size()
    tm.assert_series_equal(result, expected)
def test_size_groupby_all_null():
    """Grouping on an all-null key column yields an empty size result."""
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    all_null = DataFrame({"A": [None, None]})  # all-null groups

    sizes = all_null.groupby("A").size()

    tm.assert_series_equal(sizes, Series(dtype="int64", index=Index([], name="A")))
def test_size_period_index():
    """``size()`` grouped on a PeriodIndex level keeps that index."""
    # https://github.com/pandas-dev/pandas/issues/34010
    period_idx = PeriodIndex(["2000"], name="A", freq="D")
    ser = Series([1], index=period_idx)

    sizes = ser.groupby(level="A").size()

    # the single group has size 1, which is exactly the input series
    tm.assert_series_equal(sizes, ser)
@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
    """Grouping on a categorical key reports zero-size category combinations."""
    frame = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    frame["A"] = frame["A"].astype("category")

    result = frame.groupby(["A", "B"], as_index=as_index).size()

    # rows are (A, B, size); the unobserved pairs carry a size of 0
    size_rows = [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]]
    expected = DataFrame(size_rows, columns=["A", "B", "size"])
    expected["A"] = expected["A"].astype("category")
    if as_index:
        # with as_index=True the result is an unnamed Series indexed by (A, B)
        expected = expected.set_index(["A", "B"])["size"].rename(None)

    tm.assert_equal(result, expected)

View File

@@ -0,0 +1,910 @@
""" test with the TimeGrouper / grouping with datetimes """
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
import pytz
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
offsets,
)
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame used by groupby_with_truncated_bingrouper, made into
    a separate fixture for easier re-use in
    test_groupby_apply_timegrouper_with_nat_apply_squeeze
    """
    # one of the six dates is NaT
    dates = [
        Timestamp(2013, 9, 1, 13, 0),
        Timestamp(2013, 9, 1, 13, 5),
        Timestamp(2013, 10, 1, 20, 0),
        Timestamp(2013, 10, 3, 10, 0),
        pd.NaT,
        Timestamp(2013, 9, 2, 14, 0),
    ]
    return DataFrame({"Quantity": [18, 3, 5, 1, 9, 3], "Date": dates})
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb.grouper is a BinGrouper and
    len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq)

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    As either the index or an index level.
    """
    five_day_bins = Grouper(key="Date", freq="5D")
    gb = frame_for_truncated_bingrouper.groupby(five_day_bins)

    # check we're testing the case we're interested in
    n_result = len(gb.grouper.result_index)
    n_keys = len(gb.grouper.group_keys_seq)
    assert n_result != n_keys

    return gb
class TestGroupBy:
def test_groupby_with_timegrouper(self):
    """resample and Grouper(freq="5D") sums are checked against one frame."""
    # GH 4161
    # TimeGrouper requires a sorted index
    # also verifies that the resultant index has the correct name
    purchases = DataFrame(
        {
            "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ],
        }
    )

    # GH 6908 change target column's order
    by_quantity = purchases.sort_values(by="Quantity")

    for frame in (purchases, by_quantity):
        frame = frame.set_index(["Date"])

        bins = date_range(
            "20130901", "20131205", freq="5D", name="Date", inclusive="left"
        )
        expected = DataFrame({"Quantity": 0}, index=bins)
        # only three of the 5-day bins contain any purchases
        expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")

        via_resample = frame.resample("5D").sum()
        tm.assert_frame_equal(via_resample, expected)

        via_sorted_groupby = frame.sort_index().groupby(Grouper(freq="5D")).sum()
        tm.assert_frame_equal(via_sorted_groupby, expected)

        via_groupby = frame.groupby(Grouper(freq="5D")).sum()
        tm.assert_frame_equal(via_groupby, expected)
@pytest.mark.parametrize("should_sort", [True, False])
def test_groupby_with_timegrouper_methods(self, should_sort):
    """A freq-based Grouper gives a BinGrouper and a dict of three groups."""
    # GH 3881
    # make sure API of timegrouper conforms
    dates = [
        datetime(2013, 1, 1, 13, 0),
        datetime(2013, 1, 1, 13, 5),
        datetime(2013, 10, 1, 20, 0),
        datetime(2013, 10, 2, 10, 0),
        datetime(2013, 12, 2, 12, 0),
        datetime(2013, 12, 2, 14, 0),
    ]
    frame = DataFrame(
        {
            "Branch": "A A A A A B".split(),
            "Buyer": "Carl Mark Carl Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 8, 9, 3],
            "Date": dates,
        }
    )

    if should_sort:
        frame = frame.sort_values(by="Quantity", ascending=False)

    frame = frame.set_index("Date", drop=False)
    gb = frame.groupby(Grouper(freq="6M"))
    assert gb.group_keys
    assert isinstance(gb.grouper, BinGrouper)

    group_map = gb.groups
    assert isinstance(group_map, dict)
    assert len(group_map) == 3
def test_timegrouper_with_reg_groups(self):
    """Combine a freq-based Grouper with a regular column key ("Buyer")."""
    # GH 3794
    # allow combination of timegrouper/reg groups

    def expected_by_date_buyer(buyers, quantities, dates):
        # summed Quantity indexed by the (Date, Buyer) pair
        frame = DataFrame(
            {"Buyer": buyers.split(), "Quantity": quantities, "Date": dates}
        )
        return frame.set_index(["Date", "Buyer"])

    df_original = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ],
        }
    ).set_index("Date")
    df_sorted = df_original.sort_values(by="Quantity", ascending=False)

    for df in (df_original, df_sorted):
        # annual bins
        expected = expected_by_date_buyer(
            "Carl Joe Mark",
            [10, 18, 3],
            [
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
            ],
        )
        result = df.groupby([Grouper(freq="A"), "Buyer"]).sum()
        tm.assert_frame_equal(result, expected)

        # six-month (month start) bins
        expected = expected_by_date_buyer(
            "Carl Mark Carl Joe",
            [1, 3, 9, 18],
            [
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
            ],
        )
        result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum()
        tm.assert_frame_equal(result, expected)

    df_original = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ],
        }
    ).set_index("Date")
    df_sorted = df_original.sort_values(by="Quantity", ascending=False)

    for df in (df_original, df_sorted):
        # daily bins
        expected = expected_by_date_buyer(
            "Carl Joe Mark Carl Joe",
            [6, 8, 3, 4, 10],
            [
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 2, 0, 0),
                datetime(2013, 10, 2, 0, 0),
            ],
        )
        result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum()
        tm.assert_frame_equal(result, expected)

        # monthly bins
        result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum()
        expected = expected_by_date_buyer(
            "Carl Joe Mark",
            [10, 18, 3],
            [
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
            ],
        )
        tm.assert_frame_equal(result, expected)

        # passing the name
        df = df.reset_index()
        result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
            df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum()

        # passing the level
        df = df.set_index("Date")
        for level in ("Date", 0):
            result = df.groupby([Grouper(freq="1M", level=level), "Buyer"]).sum()
            tm.assert_frame_equal(result, expected)

        with pytest.raises(ValueError, match="The level foo is not valid"):
            df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum()

        # multi names
        df = df.copy()
        df["Date"] = df.index + offsets.MonthEnd(2)
        result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
        expected = expected_by_date_buyer(
            "Carl Joe Mark",
            [10, 18, 3],
            [
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
            ],
        )
        tm.assert_frame_equal(result, expected)

        # error as we have both a level and a name!
        msg = "The Grouper cannot specify both a key and a level!"
        with pytest.raises(ValueError, match=msg):
            df.groupby(
                [Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
            ).sum()

        # single groupers
        month_end_index = DatetimeIndex(
            [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
        )
        expected = DataFrame([[31]], columns=["Quantity"], index=month_end_index)
        result = df.groupby(Grouper(freq="1M")).sum()
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1M")]).sum()
        tm.assert_frame_equal(result, expected)

        # the "Date" column was moved forward above, so shift the expected bin
        expected.index = expected.index.shift(1)
        assert expected.index.freq == offsets.MonthEnd()
        result = df.groupby(Grouper(freq="1M", key="Date")).sum()
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1M", key="Date")]).sum()
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
def test_timegrouper_with_reg_groups_freq(self, freq):
    """Grouper(freq) + "user_id" sums agree with a per-user resample."""
    # GH 6764 multiple grouping with/without sort
    dates = pd.to_datetime(
        [
            "20121002",
            "20121007",
            "20130130",
            "20130202",
            "20130305",
            "20121002",
            "20121207",
            "20130130",
            "20130202",
            "20130305",
            "20130202",
            "20130305",
        ]
    )
    whole_cost = [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801]
    frame = DataFrame(
        {
            "date": dates,
            "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            "whole_cost": whole_cost,
            "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
        }
    ).set_index("date")

    # reference result: resample inside each user, then reshape to (date, user_id)
    resampled = frame.groupby("user_id")["whole_cost"].resample(freq)
    summed = resampled.sum(min_count=1)  # XXX
    expected = (
        summed.dropna()
        .reorder_levels(["date", "user_id"])
        .sort_index()
        .astype("int64")
    )
    expected.name = "whole_cost"

    keys = [Grouper(freq=freq), "user_id"]

    # sorted index
    from_sorted = frame.sort_index().groupby(keys)["whole_cost"].sum()
    tm.assert_series_equal(from_sorted, expected)

    # original (unsorted) index
    from_unsorted = frame.groupby(keys)["whole_cost"].sum()
    tm.assert_series_equal(from_unsorted, expected)
def test_timegrouper_get_group(self):
    """``get_group`` with Timestamp keys for monthly Grouper groupbys."""
    # GH 6914
    df_original = DataFrame(
        {
            "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ],
        }
    )
    df_reordered = df_original.sort_values(by="Quantity")

    # single grouping
    month_rows = [[0, 1, 5], [2, 3], [4]]
    expected_list = [df_original.iloc[rows] for rows in month_rows]
    dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

    for frame in (df_original, df_reordered):
        grouped = frame.groupby(Grouper(freq="M", key="Date"))
        for stamp, expected in zip(dt_list, expected_list):
            result = grouped.get_group(Timestamp(stamp))
            tm.assert_frame_equal(result, expected)

    # multiple grouping
    expected_list = [df_original.iloc[[pos]] for pos in (1, 3, 4)]
    g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

    for frame in (df_original, df_reordered):
        grouped = frame.groupby(["Buyer", Grouper(freq="M", key="Date")])
        for (buyer, stamp), expected in zip(g_list, expected_list):
            result = grouped.get_group((buyer, Timestamp(stamp)))
            tm.assert_frame_equal(result, expected)

    # with index
    df_original = df_original.set_index("Date")
    df_reordered = df_original.sort_values(by="Quantity")
    expected_list = [df_original.iloc[rows] for rows in month_rows]

    for frame in (df_original, df_reordered):
        grouped = frame.groupby(Grouper(freq="M"))
        for stamp, expected in zip(dt_list, expected_list):
            result = grouped.get_group(Timestamp(stamp))
            tm.assert_frame_equal(result, expected)
def test_timegrouper_apply_return_type_series(self):
    """apply returning a Series: freq Grouper result matches key Grouper."""
    # Using `apply` with the `TimeGrouper` should give the
    # same return type as an `apply` with a `Grouper`.
    # Issue #11742
    raw = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
    parsed = raw.copy()
    parsed["date"] = pd.to_datetime(parsed["date"])

    def sumfunc_series(x):
        # one-element Series labelled "sum" holding the group's total
        total = x["value"].sum()
        return Series([total], ("sum",))

    expected = raw.groupby(Grouper(key="date")).apply(sumfunc_series)
    result = parsed.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series)

    # the group labels differ (strings vs month ends), so compare values only
    tm.assert_frame_equal(
        result.reset_index(drop=True), expected.reset_index(drop=True)
    )
def test_timegrouper_apply_return_type_value(self):
    """apply returning a scalar: freq Grouper result matches key Grouper."""
    # Using `apply` with the `TimeGrouper` should give the
    # same return type as an `apply` with a `Grouper`.
    # Issue #11742
    raw = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
    parsed = raw.copy()
    parsed["date"] = pd.to_datetime(parsed["date"])

    def sumfunc_value(x):
        # scalar total of the group's "value" column
        return x.value.sum()

    expected = raw.groupby(Grouper(key="date")).apply(sumfunc_value)
    result = parsed.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)

    # the group labels differ (strings vs month ends), so compare values only
    tm.assert_series_equal(
        result.reset_index(drop=True), expected.reset_index(drop=True)
    )
def test_groupby_groups_datetimeindex(self):
    """``groups`` / ``get_group`` on frames indexed by a DatetimeIndex."""
    # GH#1430
    periods = 1000
    five_min_index = date_range(start="2012/1/1", freq="5min", periods=periods)
    frame = DataFrame(
        {"high": np.arange(periods), "low": np.arange(periods)},
        index=five_min_index,
    )
    by_day = frame.groupby(lambda x: datetime(x.year, x.month, x.day))

    # it works!
    group_map = by_day.groups
    first_key = list(group_map.keys())[0]
    assert isinstance(first_key, datetime)

    # GH#11442
    index = date_range("2015/01/01", periods=5, name="date")
    frame = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
    result = frame.groupby(level="date").groups
    dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
    expected = {}
    for date in dates:
        expected[Timestamp(date)] = DatetimeIndex([date], name="date")
    tm.assert_dict_equal(result, expected)

    by_date = frame.groupby(level="date")
    for date in dates:
        result = by_date.get_group(date)
        row = [frame.loc[date, "A"], frame.loc[date, "B"]]
        expected = DataFrame(
            [row],
            columns=list("AB"),
            index=DatetimeIndex([date], name="date", freq="D"),
        )
        tm.assert_frame_equal(result, expected)
def test_groupby_groups_datetimeindex_tz(self):
    """Grouping on tz-aware datetimes, as a column key and as an index level."""
    # GH 3950
    hours = ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"]
    # the three hours once for label "a" and once for label "b"
    dates = hours + hours
    frame = DataFrame(
        {
            "label": ["a", "a", "a", "b", "b", "b"],
            "datetime": dates,
            "value1": np.arange(6, dtype="int64"),
            "value2": [1, 2] * 3,
        }
    )
    frame["datetime"] = frame["datetime"].apply(
        lambda d: Timestamp(d, tz="US/Pacific")
    )

    # expected index: every hour paired with both labels
    exp_idx1 = DatetimeIndex(
        [stamp for stamp in hours for _ in range(2)],
        tz="US/Pacific",
        name="datetime",
    )
    exp_idx2 = Index(["a", "b"] * 3, name="label")
    expected = DataFrame(
        {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
        index=MultiIndex.from_arrays([exp_idx1, exp_idx2]),
        columns=["value1", "value2"],
    )

    result = frame.groupby(["datetime", "label"]).sum()
    tm.assert_frame_equal(result, expected)

    # by level
    tokyo_index = DatetimeIndex(dates, tz="Asia/Tokyo")
    frame = DataFrame(
        {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
        index=tokyo_index,
    )
    expected = DataFrame(
        {"value1": [3, 5, 7], "value2": [2, 4, 6]},
        index=DatetimeIndex(hours, tz="Asia/Tokyo"),
        columns=["value1", "value2"],
    )

    result = frame.groupby(level=0).sum()
    tm.assert_frame_equal(result, expected)
def test_frame_datetime64_handling_groupby(self):
    """``first()`` on a groupby returns the earliest-listed datetime64 value."""
    # it works!
    rows = [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))]
    frame = DataFrame(rows, columns=["a", "date"])

    firsts = frame.groupby("a").first()

    assert firsts["date"][3] == Timestamp("2012-07-03")
def test_groupby_multi_timezone(self):
    """Localising each group to its own timezone gives an object Series."""
    # combining multiple / different timezones yields UTC
    data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

    frame = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])

    def localize_to_group_tz(x):
        # the group name is the timezone string of that group
        return pd.to_datetime(x).dt.tz_localize(x.name)

    result = frame.groupby("tz").date.apply(localize_to_group_tz)

    expected = Series(
        [
            Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
            Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
            Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
            Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
            Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
        ],
        name="date",
        dtype=object,
    )
    tm.assert_series_equal(result, expected)

    # a single group can be localised directly
    tz = "America/Chicago"
    chicago_dates = frame.groupby("tz").date.get_group(tz)
    result = pd.to_datetime(chicago_dates).dt.tz_localize(tz)
    naive_expected = Series(
        ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
        index=[0, 1, 3],
        name="date",
    )
    expected = pd.to_datetime(naive_expected).dt.tz_localize(tz)
    tm.assert_series_equal(result, expected)
def test_groupby_groups_periods(self):
    """Grouping on hourly Periods, as a column key and as an index level."""
    hours = ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"]
    # the three hours once for label "a" and once for label "b"
    dates = hours + hours
    frame = DataFrame(
        {
            "label": ["a", "a", "a", "b", "b", "b"],
            "period": [pd.Period(d, freq="H") for d in dates],
            "value1": np.arange(6, dtype="int64"),
            "value2": [1, 2] * 3,
        }
    )

    # expected index: every hour paired with both labels
    exp_idx1 = pd.PeriodIndex(
        [stamp for stamp in hours for _ in range(2)],
        freq="H",
        name="period",
    )
    exp_idx2 = Index(["a", "b"] * 3, name="label")
    expected = DataFrame(
        {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
        index=MultiIndex.from_arrays([exp_idx1, exp_idx2]),
        columns=["value1", "value2"],
    )

    result = frame.groupby(["period", "label"]).sum()
    tm.assert_frame_equal(result, expected)

    # by level
    period_index = pd.PeriodIndex(dates, freq="H")
    frame = DataFrame(
        {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
        index=period_index,
    )
    expected = DataFrame(
        {"value1": [3, 5, 7], "value2": [2, 4, 6]},
        index=pd.PeriodIndex(hours, freq="H"),
        columns=["value1", "value2"],
    )

    result = frame.groupby(level=0).sum()
    tm.assert_frame_equal(result, expected)
def test_groupby_first_datetime64(self):
    """``first()`` keeps a datetime64 dtype for frame and series groupbys."""
    frame = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
    # reinterpret the integer column as nanosecond datetimes
    frame[1] = frame[1].view("M8[ns]")

    assert issubclass(frame[1].dtype.type, np.datetime64)

    # DataFrame groupby
    frame_dtype = frame.groupby(level=0).first()[1].dtype
    assert issubclass(frame_dtype.type, np.datetime64)

    # Series groupby
    series_dtype = frame[1].groupby(level=0).first().dtype
    assert issubclass(series_dtype.type, np.datetime64)
def test_groupby_max_datetime64(self):
    """Groupby ``max()`` on datetimes equals applying ``max`` per group."""
    # GH 5869
    # datetimelike dtype conversion from int
    frame = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
    grouped_col = frame.groupby("A")["A"]

    via_apply = grouped_col.apply(lambda x: x.max())
    via_method = frame.groupby("A")["A"].max()

    tm.assert_series_equal(via_method, via_apply)
def test_groupby_datetime64_32_bit(self):
    """``transform(min)`` on a Timestamp column returns the same Timestamps."""
    # GH 6410 / numpy 4328
    # 32-bit under 1.9-dev indexing issue
    stamps = [Timestamp("2000-01-1")] * 2
    frame = DataFrame({"A": range(2), "B": stamps})

    transformed = frame.groupby("A")["B"].transform(min)

    tm.assert_series_equal(transformed, Series(stamps, name="B"))
def test_groupby_with_timezone_selection(self):
    """Selecting the column before or after ``max()`` gives the same Series."""
    # GH 11616
    # Test that column selection returns output in correct timezone.
    np.random.seed(42)
    factor = np.random.randint(0, 3, size=60)
    time = date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC")
    frame = DataFrame({"factor": factor, "time": time})

    select_after = frame.groupby("factor").max()["time"]
    select_before = frame.groupby("factor")["time"].max()

    tm.assert_series_equal(select_after, select_before)
def test_timezone_info(self):
    """A tz-aware datetime keeps its tzinfo when placed into a DataFrame."""
    # see gh-11682: Timezone info lost when broadcasting
    # scalar datetime to DataFrame

    # passed inside the constructor
    built = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
    assert built["b"][0].tzinfo == pytz.utc

    # broadcast as a scalar into a new column
    broadcast = DataFrame({"a": [1, 2, 3]})
    broadcast["b"] = datetime.now(pytz.utc)
    assert broadcast["b"][0].tzinfo == pytz.utc
def test_datetime_count(self):
    """``count()`` on a datetime column gives the number of rows per group."""
    minute_stamps = date_range("now", periods=6, freq="T")
    frame = DataFrame({"a": [1, 2, 3] * 2, "dates": minute_stamps})

    counts = frame.groupby("a").dates.count()

    # each of the three keys appears twice
    expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
    tm.assert_series_equal(counts, expected)
def test_first_last_max_min_on_time_data(self):
    """max/min/first/last skip missing datetime and timedelta values."""
    # GH 10295
    # Verify that NaT is not in the result of max, min, first and last on
    # Dataframe with datetime or timedelta values.

    # Imported under its own name: aliasing it to ``td`` would shadow the
    # module-level ``td`` (pandas.util._test_decorators) inside this test.
    from datetime import timedelta

    df_test = DataFrame(
        {
            "dt": [
                np.nan,
                "2015-07-24 10:10",
                "2015-07-25 11:11",
                "2015-07-23 12:12",
                np.nan,
            ],
            "td": [
                np.nan,
                timedelta(days=1),
                timedelta(days=2),
                timedelta(days=3),
                np.nan,
            ],
        }
    )
    df_test.dt = pd.to_datetime(df_test.dt)
    df_test["group"] = "A"
    # reference frame: the same data with the missing rows removed
    df_ref = df_test[df_test.dt.notna()]

    grouped_test = df_test.groupby("group")
    grouped_ref = df_ref.groupby("group")

    tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
    tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
    tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
    tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
def test_nunique_with_timegrouper_and_nat(self):
    """``nunique`` with an hourly Grouper gives the same result without NaT rows."""
    # GH 17575
    times = [
        Timestamp("2016-06-28 09:35:35"),
        pd.NaT,
        Timestamp("2016-06-28 16:46:28"),
    ]
    frame = DataFrame({"time": times, "data": ["1", "2", "3"]})

    hourly = Grouper(key="time", freq="h")
    result = frame.groupby(hourly)["data"].nunique()

    # reference: drop the NaT row before grouping
    without_nat = frame[frame.time.notnull()]
    expected = without_nat.groupby(hourly)["data"].nunique()
    expected.index = expected.index._with_freq(None)

    tm.assert_series_equal(result, expected)
def test_scalar_call_versus_list_call(self):
# Issue: 17530
data_frame = {
"location": ["shanghai", "beijing", "shanghai"],
"time": Series(
["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
dtype="datetime64[ns]",
),
"value": [1, 2, 3],
}
data_frame = DataFrame(data_frame).set_index("time")
grouper = Grouper(freq="D")
grouped = data_frame.groupby(grouper)
result = grouped.count()
grouped = data_frame.groupby([grouper])
expected = grouped.count()
tm.assert_frame_equal(result, expected)
def test_grouper_period_index(self):
# GH 32108
periods = 2
index = pd.period_range(
start="2018-01", periods=periods, freq="M", name="Month"
)
period_series = Series(range(periods), index=index)
result = period_series.groupby(period_series.index.month).sum()
expected = Series(
range(0, periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
def test_groupby_apply_timegrouper_with_nat_dict_returns(
    self, groupby_with_truncated_bingrouper
):
    """Apply a dict-returning function to "Quantity" and check the result.

    GH#43500: case where gb.grouper.result_index and
    gb.grouper.group_keys_seq have different lengths that goes through the
    `isinstance(values[0], dict)` path.
    """
    quantity = groupby_with_truncated_bingrouper["Quantity"]
    res = quantity.apply(lambda x: {"foo": len(x)})

    # Seven 5-day bins, each paired with the single dict key "foo".
    bin_starts = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
    index = MultiIndex.from_arrays([bin_starts, ["foo"] * len(bin_starts)])
    counts = [3] + [0] * 5 + [2]
    expected = Series(counts, index=index, name="Quantity")
    tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
    self, groupby_with_truncated_bingrouper
):
    """Apply a scalar-returning function to "Quantity" and check the result.

    GH#43500: previously raised ValueError because an index with incorrect
    length was used in wrap_applied_result.
    """
    quantity = groupby_with_truncated_bingrouper["Quantity"]
    # First value of each group, NaN for groups without rows.
    res = quantity.apply(lambda x: x.iloc[0] if len(x) else np.nan)

    bin_starts = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
    values = [18] + [np.nan] * 5 + [5]
    expected = Series(
        values,
        index=bin_starts._with_freq(None),
        name="Quantity",
    )
    tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
    self, frame_for_truncated_bingrouper
):
    """Apply a Series-returning function on a squeezed, single-group GroupBy."""
    frame = frame_for_truncated_bingrouper

    # We need a GroupBy object with only one non-NaT group, so use a huge
    # freq so that all non-NaT dates will be grouped together.
    century = Grouper(key="Date", freq="100Y")
    with tm.assert_produces_warning(FutureWarning, match="`squeeze` parameter"):
        gb = frame.groupby(century, squeeze=True)

    # Check that we will go through the singular_series path
    # in _wrap_applied_output_series.
    assert gb.ngroups == 1
    assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

    # function that returns a Series
    res = gb.apply(lambda x: x["Quantity"] * 2)

    group_key = Timestamp("2013-12-31")
    # Row labels of the non-missing dates, in ascending date order.
    ordering = frame["Date"].sort_values().dropna().index
    index = MultiIndex.from_product([[group_key], ordering], names=["Date", None])
    doubled = frame["Quantity"].take(ordering).values * 2
    expected = Series(doubled, index=index, name="Quantity")
    tm.assert_series_equal(res, expected)
@td.skip_if_no("numba")
def test_groupby_agg_numba_timegrouper_with_nat(
    self, groupby_with_truncated_bingrouper
):
    """Numba-engine nanmean aggregation must match the default engine.

    See discussion in GH#43487.
    """
    gb = groupby_with_truncated_bingrouper

    # Series selection
    series_gb = gb["Quantity"]
    result = series_gb.aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    expected = series_gb.aggregate(np.nanmean)
    tm.assert_series_equal(result, expected)

    # single-column DataFrame selection
    frame_gb = gb[["Quantity"]]
    result_df = frame_gb.aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    expected_df = frame_gb.aggregate(np.nanmean)
    tm.assert_frame_equal(result_df, expected_df)

View File

@@ -0,0 +1,174 @@
"""
These tests systematically exercise all of the arguments to value_counts
with different size combinations. This is to ensure the stability of the
sorting and proper parameter handling.
"""
from itertools import product
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Grouper,
MultiIndex,
Series,
date_range,
to_datetime,
)
import pandas._testing as tm
# our starting frame
def seed_df(seed_nans, n, m):
    """Build the reproducible starting frame used by the value_counts tests.

    The global NumPy random state is re-seeded with 1234 on every call, so
    equal arguments always produce an equal frame.

    Parameters
    ----------
    seed_nans : bool
        When true, overwrite fixed strides of rows in every column with NaN.
    n : int
        Number of rows.
    m : int
        Inclusive upper bound of the random integers stored in column "3rd".

    Returns
    -------
    DataFrame
        Columns "1st" (letters drawn from "abcd"), "2nd" (timestamps drawn
        from ten consecutive days starting 2015-08-24) and "3rd" (integers
        in [1, m]; stored as float when ``seed_nans`` is true).
    """
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    frame = DataFrame(
        {
            "1st": np.random.choice(list("abcd"), n),
            "2nd": np.random.choice(days, n),
            "3rd": np.random.randint(1, m + 1, n),
        }
    )

    if seed_nans:
        # Cast explicitly instead of relying on the implicit int -> float
        # upcast that assigning NaN into an integer column would trigger;
        # newer pandas releases deprecate that silent upcast.
        frame["3rd"] = frame["3rd"].astype("float")

        frame.loc[1::11, "1st"] = np.nan
        frame.loc[3::17, "2nd"] = np.nan
        for start in (7, 8, 9):
            frame.loc[start::19, "3rd"] = np.nan

    return frame
# Build every (frame, grouping key, bins, n, m) combination used to
# parametrize the value_counts test, together with a matching test id.
binned = []
ids = []
for seed_nans, (n, m) in product([True, False], product((100, 1000), (5, 20))):
    df = seed_df(seed_nans, n, m)
    # Either no binning at all, or edges 0, 2, 4, ... reaching at least 5.
    bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
    keys = "1st", "2nd", ["1st", "2nd"]
    for k, b in product(keys, bins):
        binned.append((df, k, b, n, m))
        ids.append(f"{k}-{n}-{m}")
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    df, keys, bins, n, m, isort, normalize, sort, ascending, dropna
):
    """SeriesGroupBy.value_counts must match applying Series.value_counts.

    Both sides receive the same keyword arguments and are compared after
    sorting on the index.
    """

    def rebuild_index(obj):
        # Recreate the MultiIndex from the values of each of its levels.
        levels = [
            obj.index.get_level_values(i) for i in range(obj.index.nlevels)
        ]
        obj.index = MultiIndex.from_arrays(levels, names=obj.index.names)
        return obj

    kwargs = dict(
        normalize=normalize,
        sort=sort,
        ascending=ascending,
        dropna=dropna,
        bins=bins,
    )

    left = df.groupby(keys, sort=isort)["3rd"].value_counts(**kwargs)

    right = df.groupby(keys, sort=isort)["3rd"].apply(Series.value_counts, **kwargs)
    right.index.names = right.index.names[:-1] + ["3rd"]

    # have to sort on index because of unstable sort on values; xref GH9212
    left = rebuild_index(left)
    right = rebuild_index(right)
    tm.assert_series_equal(left.sort_index(), right.sort_index())
def test_series_groupby_value_counts_with_grouper():
# GH28479
df = DataFrame(
{
"Timestamp": [
1565083561,
1565083561 + 86400,
1565083561 + 86500,
1565083561 + 86400 * 2,
1565083561 + 86400 * 3,
1565083561 + 86500 * 3,
1565083561 + 86400 * 4,
],
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
}
).drop([3])
df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
# have to sort on index because of unstable sort on values xref GH9212
result = dfg["Food"].value_counts().sort_index()
expected = dfg["Food"].apply(Series.value_counts).sort_index()
expected.index.names = result.index.names
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
# GH39172
df = DataFrame(columns=columns)
dfg = df.groupby(columns[:-1])
result = dfg[columns[-1]].value_counts()
expected = Series([], name=columns[-1], dtype=result.dtype)
expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
# GH42618
df = DataFrame(data=[range(len(columns))], columns=columns)
dfg = df.groupby(columns[:-1])
result = dfg[columns[-1]].value_counts()
expected = df.value_counts().rename(columns[-1])
tm.assert_series_equal(result, expected)
def test_series_groupby_value_counts_on_categorical():
# GH38672
s = Series(Categorical(["a"], categories=["a", "b"]))
result = s.groupby([0]).value_counts()
expected = Series(
data=[1, 0],
index=MultiIndex.from_arrays(
[
[0, 0],
CategoricalIndex(
["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
),
]
),
name=0,
)
# Expected:
# 0 a 1
# b 0
# Name: 0, dtype: int64
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,205 @@
import pytest
from pandas.errors import NumbaUtilError
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
Series,
option_context,
)
import pandas._testing as tm
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@td.skip_if_no("numba")
def test_correct_function_signature():
    """A one-argument function must be rejected by the numba transform engine."""

    def incorrect_function(x):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # DataFrameGroupBy
    with pytest.raises(NumbaUtilError, match="The first 2"):
        frame.groupby("key").transform(incorrect_function, engine="numba")

    # SeriesGroupBy
    with pytest.raises(NumbaUtilError, match="The first 2"):
        frame.groupby("key")["data"].transform(incorrect_function, engine="numba")
@td.skip_if_no("numba")
def test_check_nopython_kwargs():
    """Passing keyword arguments through the numba engine must raise."""

    def incorrect_function(x, **kwargs):
        return x + 1

    frame = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # DataFrameGroupBy
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        frame.groupby("key").transform(incorrect_function, engine="numba", a=1)

    # SeriesGroupBy
    with pytest.raises(NumbaUtilError, match="numba does not support"):
        frame.groupby("key")["data"].transform(
            incorrect_function, engine="numba", a=1
        )
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython):
    """transform with engine="numba" must equal the engine="cython" result."""

    def func(values, index):
        return values + 1

    if jit:
        # Test accepted jitted functions
        import numba

        func = numba.jit(func)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    by_key = frame.groupby(0)
    target = by_key[1] if pandas_obj == "Series" else by_key

    result = target.transform(func, engine="numba", engine_kwargs=engine_kwargs)
    expected = target.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(result, expected)
@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Test that the functions are cached correctly if we switch functions."""

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

    by_key = frame.groupby(0)
    target = by_key[1] if pandas_obj == "Series" else by_key

    def check_against_cython(numba_func, reference):
        # Run the numba engine and compare with the cython-engine reference.
        result = target.transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )
        expected = target.transform(reference, engine="cython")
        tm.assert_equal(result, expected)

    check_against_cython(func_1, lambda x: x + 1)
    # func_1 should be in the cache now
    assert (func_1, "groupby_transform") in NUMBA_FUNC_CACHE

    # Add func_2 to the cache
    check_against_cython(func_2, lambda x: x * 5)
    assert (func_2, "groupby_transform") in NUMBA_FUNC_CACHE

    # Retest func_1 which should use the cache
    check_against_cython(func_1, lambda x: x + 1)
@td.skip_if_no("numba")
def test_use_global_config():
    """engine=None under compute.use_numba=True must match engine="numba"."""

    def func_1(values, index):
        return values + 1

    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    by_key = frame.groupby(0)

    # Reference result with the engine requested explicitly.
    expected = by_key.transform(func_1, engine="numba")

    with option_context("compute.use_numba", True):
        result = by_key.transform(func_1, engine=None)

    tm.assert_frame_equal(expected, result)
@td.skip_if_no("numba")
@pytest.mark.parametrize(
    "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
)
def test_multifunc_notimplimented(agg_func):
    """Non-callable function specs must raise under the numba engine."""
    frame = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    by_key = frame.groupby(0)

    # DataFrameGroupBy
    with pytest.raises(NotImplementedError, match="Numba engine can"):
        by_key.transform(agg_func, engine="numba")

    # SeriesGroupBy
    with pytest.raises(NotImplementedError, match="Numba engine can"):
        by_key[1].transform(agg_func, engine="numba")
@td.skip_if_no("numba")
def test_args_not_cached():
    """A changed positional argument must be honoured on the second call.

    GH 41647
    """

    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]

    # Every group holds two ones, so summing the last n rows gives n.
    for last_n, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.transform(sum_last, last_n, engine="numba")
        expected = Series([total] * 4, name="x")
        tm.assert_series_equal(result, expected)
@td.skip_if_no("numba")
def test_index_data_correctly_passed():
    """The function must receive the frame's own index labels.

    GH 43133
    """

    def f(values, index):
        return index - 1

    labels = [-1, -2, -3]
    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=labels)

    result = frame.groupby("group").transform(f, engine="numba")

    # Expected values are the expected frame's index labels minus one, sorted.
    expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3])
    tm.assert_frame_equal(result, expected)
@td.skip_if_no("numba")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a MultiIndex-ed frame by one level works with the numba engine."""

    def numba_func(values, index):
        return 1

    record = {"A": 1, "B": 2, "C": 3}
    frame = DataFrame([record]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    result = frame.groupby("A").transform(
        numba_func, engine="numba", engine_kwargs=engine_kwargs
    )

    # The remaining column "C" is replaced by the constant returned above.
    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)
@td.skip_if_no("numba")
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels must raise under the numba engine."""

    def numba_func(values, index):
        return 1

    record = {"A": 1, "B": 2, "C": 3}
    frame = DataFrame([record]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    with pytest.raises(NotImplementedError, match="More than 1 grouping labels"):
        frame.groupby(["A", "B"]).transform(
            numba_func, engine="numba", engine_kwargs=engine_kwargs
        )