Old engine for Continuous Time Bayesian Networks. Superseded by reCTBN. 🐍
https://github.com/madlabunimib/PyCTBN
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2058 lines
61 KiB
2058 lines
61 KiB
4 years ago
|
from datetime import datetime
|
||
|
from decimal import Decimal
|
||
|
from io import StringIO
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas.errors import PerformanceWarning
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv
|
||
|
import pandas._testing as tm
|
||
|
from pandas.core.base import SpecificationError
|
||
|
import pandas.core.common as com
|
||
|
|
||
|
|
||
|
def test_repr():
    """Check ``repr`` of a ``Grouper`` constructed with both ``key`` and ``level``."""
    # GH18203
    result = repr(pd.Grouper(key="A", level="B"))
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert result == expected
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):
    """Exercise iteration, aggregate, apply and transform on a grouped Series.

    A nine-element Series holding ``0, 0, 0, 1, 1, 1, 2, 2, 2`` is shuffled and
    grouped by ``label // 3``, giving three groups of three rows each.
    """
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    for _, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    tm.assert_series_equal(agged, grouped.mean())
    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    # label 7 sits in the group whose values are all 2: 2 * (2 + 2 + 2) == 12
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    value_grouped = data.groupby(data)
    tm.assert_series_equal(
        value_grouped.aggregate(np.mean), agged, check_index_type=False
    )

    # complex agg: only checked for not raising, the result is not inspected
    grouped.aggregate([np.mean, np.std])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
|
||
|
|
||
|
|
||
|
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Group by a non-object key array and check column dtypes after ``apply``."""
    key = mframe.index.codes[0]
    grouped = mframe.groupby(key)
    result = grouped.sum()

    # the same keys cast to object dtype must produce the same sums
    expected = mframe.groupby(key.astype("O")).sum()
    tm.assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df["value"] = range(len(df))

    def max_value(group):
        # row of the group holding the largest "value"
        return group.loc[group["value"].idxmax()]

    applied = df.groupby("A").apply(max_value)
    result = applied.dtypes
    expected = Series(
        [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
        index=["A", "B", "C", "D", "value"],
    )
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_return_type():
    """Check result types of ``groupby(..., squeeze=...)`` and its FutureWarning."""

    def func(dataf):
        # de-mean "val2" within the group
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    df1 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 2, "val2": 27},
            {"val1": 2, "val2": 12},
        ]
    )

    with tm.assert_produces_warning(FutureWarning):
        result = df1.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # same data, but every row falls into a single group
    df2 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 1, "val2": 27},
            {"val1": 1, "val2": 12},
        ]
    )

    with tm.assert_produces_warning(FutureWarning):
        result = df2.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"])
    with tm.assert_produces_warning(FutureWarning):
        result = df.groupby("X", squeeze=False).count()
    assert isinstance(result, DataFrame)
|
||
|
|
||
|
|
||
|
def test_inconsistent_return_type():
    """Check ``apply`` results when the applied function returns None for one group."""
    # GH5592
    # inconsistent return type
    df = DataFrame(
        dict(
            A=["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            B=Series(np.arange(7), dtype="int64"),
            C=date_range("20130101", periods=7),
        )
    )

    def first_row(grp):
        return grp.iloc[0]

    def first_row_skip_tiger(grp):
        if grp.name == "Tiger":
            return None
        return grp.iloc[0]

    def first_row_skip_pony(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    def first_c_skip_pony(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    expected = df.groupby("A").first()[["B"]]
    result = df.groupby("A").apply(first_row)[["B"]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").apply(first_row_skip_tiger)[["B"]]
    e = expected.copy()
    e.loc["Tiger"] = np.nan
    tm.assert_frame_equal(result, e)

    result = df.groupby("A").apply(first_row_skip_pony)[["B"]]
    e = expected.copy()
    e.loc["Pony"] = np.nan
    tm.assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    result = df.groupby("A").apply(first_row_skip_pony)[["C"]]
    e = df.groupby("A").first()[["C"]]
    e.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, e)

    # scalar outputs
    result = df.groupby("A").apply(first_c_skip_pony)
    e = df.groupby("A").first()["C"].copy()
    e.loc["Pony"] = np.nan
    e.name = None
    tm.assert_series_equal(result, e)
|
||
|
|
||
|
|
||
|
def test_pass_args_kwargs(ts, tsframe):
    """Check that extra args/kwargs reach the function in agg, apply and transform."""

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(0.8)
    trans_expected = ts_grouped.transform(g)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # same checks, passing the percentile as a keyword
    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame
    df_grouped = tsframe.groupby(lambda x: x.month)
    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
    apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
    expected = df_grouped.quantile(0.8)
    tm.assert_frame_equal(apply_result, expected, check_names=False)
    tm.assert_frame_equal(agg_result, expected)

    agg_result = df_grouped.agg(f, q=80)
    apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
    tm.assert_frame_equal(agg_result, expected)
    tm.assert_frame_equal(apply_result, expected, check_names=False)
|
||
|
|
||
|
|
||
|
def test_len():
    """Check that ``len`` of a GroupBy equals the number of groups."""
    # Business-day frame built with public API only; every row has a distinct
    # (year, month, day), so grouping by all three gives one group per row.
    df = DataFrame(
        np.random.randn(30, 4),
        columns=list("ABCD"),
        index=date_range("2000-01-01", periods=30, freq="B"),
    )
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    assert len(grouped) == len(df)

    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
    expected = len({(x.year, x.month) for x in df.index})
    assert len(grouped) == expected

    # issue 11016
    df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    assert len(df.groupby("a")) == 0
    assert len(df.groupby("b")) == 3
    assert len(df.groupby(["a", "b"])) == 3
|
||
|
|
||
|
|
||
|
def test_basic_regression():
    """Regression check: grouping by a longer random float Series must not raise."""
    # regression
    result = Series([1.0 * x for x in list(range(1, 10)) * 10])

    data = np.random.random(1100) * 10.0
    groupings = Series(data)

    grouped = result.groupby(groupings)
    # only checked for not raising; the means themselves are not inspected
    grouped.mean()
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    """Aggregate a Series whose group labels contain NaN.

    Only the "bar" (4 rows) and "foo" (2 rows) labels are expected in the result.
    """
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=index,
    )

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)
    assert issubclass(agged.dtype.type, np.dtype(dtype).type)
|
||
|
|
||
|
|
||
|
def test_indices_concatenation_order():
    """Check ``apply`` when per-group results have empty or mismatched indexes."""
    # GH 2808

    def f1(x):
        # empty result carries a two-level index named like the non-empty one
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
            res = DataFrame(columns=["a"], index=multiindex)
            return res
        else:
            y = y.set_index(["b", "c"])
            return y

    def f2(x):
        # empty result is a bare DataFrame with no MultiIndex
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(["b", "c"])
            return y

    def f3(x):
        # empty result has a MultiIndex while the non-empty one keeps its index
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(
                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
            )
            res = DataFrame(columns=["a", "b"], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    # correct result
    result1 = df.groupby("a").apply(f1)
    result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f3)
|
||
|
|
||
|
|
||
|
def test_attr_wrapper(ts):
    """Check methods and attributes reached through a SeriesGroupBy object."""
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    tm.assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    tm.assert_frame_equal(result, expected)

    # get attribute
    # NOTE(review): result/expected are computed but never compared here;
    # confirm whether an assert_series_equal was intended.
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, "foo")
|
||
|
|
||
|
|
||
|
def test_frame_groupby(tsframe):
    """Check aggregate, transform, iteration and groups/indices on a frame
    grouped by weekday."""
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby("weekday").aggregate(np.mean)
    tm.assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate: every row of a group must carry that group's mean
    transformed = grouped.transform(lambda x: x.mean())
    for _, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for k, v in groups.items():
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()
|
||
|
|
||
|
|
||
|
def test_frame_groupby_columns(tsframe):
    """Group the columns (``axis=1``) through a label-to-key mapping."""
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform: grouping the transposed frame along axis=0 must agree
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

    # iterate
    for _, v in grouped:
        assert len(v.columns) == 2
|
||
|
|
||
|
|
||
|
def test_frame_set_name_single(df):
    """Check that results of single-key grouping name their index after the key."""
    grouped = df.groupby("A")

    result = grouped.mean()
    assert result.index.name == "A"

    result = df.groupby("A", as_index=False).mean()
    assert result.index.name != "A"

    result = grouped.agg(np.mean)
    assert result.index.name == "A"

    result = grouped.agg({"C": np.mean, "D": np.std})
    assert result.index.name == "A"

    # same checks on a single selected column
    result = grouped["C"].mean()
    assert result.index.name == "A"
    result = grouped["C"].agg(np.mean)
    assert result.index.name == "A"
    result = grouped["C"].agg([np.mean, np.std])
    assert result.index.name == "A"

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": np.mean, "bar": np.std})
|
||
|
|
||
|
|
||
|
def test_multi_func(df):
    """Group by two callables and compare with grouping by the column names."""
    col1 = df["A"]
    col2 = df["B"]

    grouped = df.groupby([col1.get, col2.get])
    agged = grouped.mean()
    expected = df.groupby(["A", "B"]).mean()

    # TODO groupby get drops names
    tm.assert_frame_equal(
        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
    )

    # some "groups" with no data
    df = DataFrame(
        {
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    # only verify that it works for now
    grouped = df.groupby(["k1", "k2"])
    grouped.agg(np.sum)
|
||
|
|
||
|
|
||
|
def test_multi_key_multiple_functions(df):
|
||
|
grouped = df.groupby(["A", "B"])["C"]
|
||
|
|
||
|
agged = grouped.agg([np.mean, np.std])
|
||
|
expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)})
|
||
|
tm.assert_frame_equal(agged, expected)
|
||
|
|
||
|
|
||
|
def test_frame_multi_key_function_list():
    """Aggregate a two-key grouped frame with a list of functions.

    The result must equal the per-column aggregations of "D", "E" and "F"
    concatenated side by side under those keys.
    """
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    expected = pd.concat(
        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
        keys=["D", "E", "F"],
        axis=1,
    )
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Compare two-key grouping with nested single-key grouping for ``op``."""
    data = df
    grouped = data.groupby(["A", "B"])

    result1 = op(grouped)

    # build the expected frame by grouping on "A" and then on "B" by hand
    keys = []
    values = []
    for n1, gp1 in data.groupby("A"):
        for n2, gp2 in gp1.groupby("B"):
            keys.append((n1, n2))
            values.append(op(gp2.loc[:, ["C", "D"]]))

    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
    expected = pd.concat(values, axis=1).T
    expected.index = mi

    # a little bit crude
    for col in ["C", "D"]:
        result_col = op(grouped[col])
        pivoted = result1[col]
        exp = expected[col]
        tm.assert_series_equal(result_col, exp)
        tm.assert_series_equal(pivoted, exp)

    # test single series works the same
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_as_index_select_column():
    """Select one column from an ``as_index=False`` GroupBy and use it."""
    # GH 5764
    df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    result = df.groupby("A", as_index=False)["B"].get_group(1)
    expected = pd.Series([2, 4], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum())
    expected = pd.Series(
        [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    )
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_as_index_select_column_sum_empty_df():
    """Summing a selected column of an empty ``as_index=False`` GroupBy
    must give an empty DataFrame with the key and the selected column."""
    # GH 35246
    df = DataFrame(columns=["A", "B", "C"])
    left = df.groupby(by="A", as_index=False)["B"].sum()
    assert type(left) is DataFrame
    assert left.to_dict() == {"A": {}, "B": {}}
|
||
|
|
||
|
|
||
|
def test_groupby_as_index_agg(df):
    """Check ``agg`` on ``as_index=False`` groupings with one and two keys."""
    grouped = df.groupby("A", as_index=False)

    # single-key

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    grouped = df.groupby("A", as_index=True)

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"Q": np.sum})

    # multi-key

    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    expected3 = grouped["C"].sum()
    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
    result3 = grouped["C"].agg({"Q": np.sum})
    tm.assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"])
    ts = Series(np.random.randint(5, 10, 50), name="jim")

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    # as_index=False must match as_index=True with the index reset
    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        tm.assert_frame_equal(left, right)
|
||
|
|
||
|
|
||
|
def test_ops_not_as_index(reduction_func):
    """Check each reduction under ``as_index=False`` against the reset-index
    result of the default grouping."""
    # GH 10355, 21090
    # Using as_index=False should not modify grouped column

    if reduction_func in ("corrwith",):
        pytest.skip("Test not applicable")

    if reduction_func in ("nth", "ngroup",):
        pytest.skip("Skip until behavior is determined (GH #5755)")

    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size")
    expected = expected.reset_index()

    g = df.groupby("a", as_index=False)

    result = getattr(g, reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g.agg(reduction_func)
    tm.assert_frame_equal(result, expected)

    result = getattr(g["b"], reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g["b"].agg(reduction_func)
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_as_index_series_return_frame(df):
    """Aggregating a selected column under ``as_index=False`` must return a
    DataFrame holding the key column(s) and the selected column."""
    grouped = df.groupby("A", as_index=False)
    grouped2 = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)

    # same checks through the .sum() method
    result = grouped["C"].sum()
    expected = grouped.sum().loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].sum()
    expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)
|
||
|
|
||
|
|
||
|
def test_as_index_series_column_slice_raises(df):
    """Selecting a second column after one is already selected must raise IndexError."""
    # GH15072
    grouped = df.groupby("A", as_index=False)
    msg = r"Column\(s\) C already selected"

    with pytest.raises(IndexError, match=msg):
        grouped["C"].__getitem__("D")
|
||
|
|
||
|
|
||
|
def test_groupby_as_index_cython(df):
    """``mean`` under ``as_index=False`` must equal the default result with
    the keys moved into leading columns and a fresh integer index."""
    data = df

    # single-key
    grouped = data.groupby("A", as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A"]).mean()
    expected.insert(0, "A", expected.index)
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)

    # multi-key
    grouped = data.groupby(["A", "B"], as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A", "B"]).mean()

    # split the (A, B) index tuples into one sequence per key
    arrays = list(zip(*expected.index.values))
    expected.insert(0, "A", arrays[0])
    expected.insert(1, "B", arrays[1])
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_as_index_series_scalar(df):
|
||
|
grouped = df.groupby(["A", "B"], as_index=False)
|
||
|
|
||
|
# GH #421
|
||
|
|
||
|
result = grouped["C"].agg(len)
|
||
|
expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_as_index_corner(df, ts):
    """``as_index=False`` must be rejected for a Series and for ``axis=1``."""
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
|
||
|
|
||
|
|
||
|
def test_groupby_multiple_key(df):
|
||
|
df = tm.makeTimeDataFrame()
|
||
|
grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
|
||
|
agged = grouped.sum()
|
||
|
tm.assert_almost_equal(df.values, agged.values)
|
||
|
|
||
|
grouped = df.T.groupby(
|
||
|
[lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
|
||
|
)
|
||
|
|
||
|
agged = grouped.agg(lambda x: x.sum())
|
||
|
tm.assert_index_equal(agged.index, df.columns)
|
||
|
tm.assert_almost_equal(df.T.values, agged.values)
|
||
|
|
||
|
agged = grouped.agg(lambda x: x.sum())
|
||
|
tm.assert_almost_equal(df.T.values, agged.values)
|
||
|
|
||
|
|
||
|
def test_groupby_multi_corner(df):
|
||
|
# test that having an all-NA column doesn't mess you up
|
||
|
df = df.copy()
|
||
|
df["bad"] = np.nan
|
||
|
agged = df.groupby(["A", "B"]).mean()
|
||
|
|
||
|
expected = df.groupby(["A", "B"]).mean()
|
||
|
expected["bad"] = np.nan
|
||
|
|
||
|
tm.assert_frame_equal(agged, expected)
|
||
|
|
||
|
|
||
|
def test_omit_nuisance(df):
    """Check that columns an aggregation cannot handle are left out of the result."""
    grouped = df.groupby("A")

    result = grouped.mean()
    expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
    tm.assert_frame_equal(result, expected)

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    tm.assert_frame_equal(agged, exp)

    # add a datetime column and aggregate with sum
    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    grouped = df.groupby("A")
    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_frame_equal(result, expected)

    # won't work with axis = 1
    grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    msg = "reduction operation 'sum' not allowed for this dtype"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))
|
||
|
|
||
|
|
||
|
def test_omit_nuisance_python_multiple(three_group):
    """``agg(np.mean)`` on a two-key grouping must match ``.mean()``."""
    grouped = three_group.groupby(["A", "B"])

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    tm.assert_frame_equal(agged, exp)
|
||
|
|
||
|
|
||
|
def test_empty_groups_corner(mframe):
    """Check aggregation when some key combinations never occur in the data."""
    # handle empty groups
    df = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
        }
    )

    grouped = df.groupby(["k1", "k2"])
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    # group a two-row slice by its first index level
    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped["A"].apply(np.mean)
    tm.assert_series_equal(agged["A"], agged_A)
    assert agged.index.name == "first"
|
||
|
|
||
|
|
||
|
def test_nonsense_func():
    """A key function that fails on the index labels must surface its TypeError."""
    df = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
    with pytest.raises(TypeError, match=msg):
        # index labels are ints, so `x + "foo"` cannot be evaluated
        df.groupby(lambda x: x + "foo")
|
||
|
|
||
|
|
||
|
def test_wrap_aggregated_output_multindex(mframe):
    """Aggregate a frame with MultiIndex columns, including a failing column."""
    df = mframe.T
    df["baz", "two"] = "peekaboo"

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    agged = df.groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        # fail on exactly one column; the asserts below expect it to be dropped
        if ser.name == ("foo", "one"):
            raise TypeError
        else:
            return ser.sum()

    agged2 = df.groupby(keys).aggregate(aggfun)
    assert len(agged2.columns) + 1 == len(df.columns)
|
||
|
|
||
|
|
||
|
def test_groupby_level_apply(mframe):
|
||
|
|
||
|
result = mframe.groupby(level=0).count()
|
||
|
assert result.index.name == "first"
|
||
|
result = mframe.groupby(level=1).count()
|
||
|
assert result.index.name == "second"
|
||
|
|
||
|
result = mframe["A"].groupby(level=0).count()
|
||
|
assert result.index.name == "first"
|
||
|
|
||
|
|
||
|
def test_groupby_level_mapper(mframe):
|
||
|
deleveled = mframe.reset_index()
|
||
|
|
||
|
mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
|
||
|
mapper1 = {"one": 0, "two": 0, "three": 1}
|
||
|
|
||
|
result0 = mframe.groupby(mapper0, level=0).sum()
|
||
|
result1 = mframe.groupby(mapper1, level=1).sum()
|
||
|
|
||
|
mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]])
|
||
|
mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]])
|
||
|
expected0 = mframe.groupby(mapped_level0).sum()
|
||
|
expected1 = mframe.groupby(mapped_level1).sum()
|
||
|
expected0.index.name, expected1.index.name = "first", "second"
|
||
|
|
||
|
tm.assert_frame_equal(result0, expected0)
|
||
|
tm.assert_frame_equal(result1, expected1)
|
||
|
|
||
|
|
||
|
def test_groupby_level_nonmulti():
    """Check which ``level`` arguments are accepted for a flat (non-Multi) index."""
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    # 0 and -1, as scalars or one-element lists, all refer to the only level
    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])
|
||
|
|
||
|
|
||
|
def test_groupby_complex():
    """Sum complex values per index label via ``groupby`` and ``sum(level=)``."""
    # GH 12902
    # values are 0, 1+2j, 2+4j, 3+6j under index labels 0, 0, 1, 1
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)

    result = a.sum(level=0)
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_series_indexed_differently():
    """Group a Series by another Series whose index only partly overlaps."""
    s1 = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    s2 = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    grouped = s1.groupby(s2)
    agged = grouped.mean()
    # expected: look each s1 label up in s2 reindexed onto s1's index
    exp = s1.groupby(s2.reindex(s1.index).get).mean()
    tm.assert_series_equal(agged, exp)
|
||
|
|
||
|
|
||
|
def test_groupby_with_hier_columns():
|
||
|
tuples = list(
|
||
|
zip(
|
||
|
*[
|
||
|
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
|
||
|
["one", "two", "one", "two", "one", "two", "one", "two"],
|
||
|
]
|
||
|
)
|
||
|
)
|
||
|
index = MultiIndex.from_tuples(tuples)
|
||
|
columns = MultiIndex.from_tuples(
|
||
|
[("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
|
||
|
)
|
||
|
df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
|
||
|
|
||
|
result = df.groupby(level=0).mean()
|
||
|
tm.assert_index_equal(result.columns, columns)
|
||
|
|
||
|
result = df.groupby(level=0, axis=1).mean()
|
||
|
tm.assert_index_equal(result.index, df.index)
|
||
|
|
||
|
result = df.groupby(level=0).agg(np.mean)
|
||
|
tm.assert_index_equal(result.columns, columns)
|
||
|
|
||
|
result = df.groupby(level=0).apply(lambda x: x.mean())
|
||
|
tm.assert_index_equal(result.columns, columns)
|
||
|
|
||
|
result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
|
||
|
tm.assert_index_equal(result.columns, Index(["A", "B"]))
|
||
|
tm.assert_index_equal(result.index, df.index)
|
||
|
|
||
|
# add a nuisance column
|
||
|
sorted_columns, _ = columns.sortlevel(0)
|
||
|
df["A", "foo"] = "bar"
|
||
|
result = df.groupby(level=0).mean()
|
||
|
tm.assert_index_equal(result.columns, df.columns[:-1])
|
||
|
|
||
|
|
||
|
def test_grouping_ndarray(df):
    """Grouping by the raw values of column "A" matches grouping by its label."""
    by_values = df.groupby(df["A"].values).sum()
    by_label = df.groupby("A").sum()

    # Note: no names when grouping by value
    tm.assert_frame_equal(by_values, by_label, check_names=False)
|
||
|
|
||
|
|
||
|
def test_groupby_wrong_multi_labels():
    """``agg(np.mean)`` and ``mean()`` agree when grouping by four label columns."""
    csv_text = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    frame = read_csv(StringIO(csv_text), index_col=0)

    by_labels = frame.groupby(["foo", "bar", "baz", "spam"])

    tm.assert_frame_equal(by_labels.agg(np.mean), by_labels.mean())
|
||
|
|
||
|
|
||
|
def test_groupby_series_with_name(df):
    """Named Series used as group keys lend their names to the result labels."""
    indexed = df.groupby(df["A"]).mean()
    flat = df.groupby(df["A"], as_index=False).mean()
    assert indexed.index.name == "A"
    assert "A" in flat

    indexed = df.groupby([df["A"], df["B"]]).mean()
    flat = df.groupby([df["A"], df["B"]], as_index=False).mean()
    assert indexed.index.names == ("A", "B")
    for label in ("A", "B"):
        assert label in flat
|
||
|
|
||
|
|
||
|
def test_seriesgroupby_name_attr(df):
    """Reductions on a selected column keep that column's name."""
    # GH 6265
    grouped_c = df.groupby("A")["C"]
    assert grouped_c.count().name == "C"
    assert grouped_c.mean().name == "C"

    def double_sum(x):
        return np.sum(x) * 2

    assert grouped_c.agg(double_sum).name == "C"
|
||
|
|
||
|
|
||
|
def test_consistency_name():
    """Counting "B" per "A" gives the same Series via frame or Series groupby."""
    # GH 12363
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    via_frame = frame.groupby(["A"]).B.count()
    via_series = frame.B.groupby(frame.A).count()
    tm.assert_series_equal(via_series, via_frame)
|
||
|
|
||
|
|
||
|
def test_groupby_name_propagation(df):
    """``apply`` propagates the returned Series' name only when it is consistent."""
    # GH 6124
    stats = {"count": 1, "mean": 2, "omissions": 3}

    def summarize(group, name=None):
        return Series(stats, name=name)

    def summarize_random_name(group):
        # Provide a different name for each Series.  In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series(stats, name=group.iloc[0]["A"])

    unnamed = df.groupby("A").apply(summarize)
    assert unnamed.columns.name is None

    named = df.groupby("A").apply(summarize, "metrics")
    assert named.columns.name == "metrics"

    inconsistent = df.groupby("A").apply(summarize_random_name)
    assert inconsistent.columns.name is None
|
||
|
|
||
|
|
||
|
def test_groupby_nonstring_columns():
    """Grouping by integer column label 0 matches grouping by the column itself."""
    frame = DataFrame([np.arange(10) for _ in range(10)])

    by_label = frame.groupby(0).mean()
    by_series = frame.groupby(frame[0]).mean()

    tm.assert_frame_equal(by_label, by_series)
|
||
|
|
||
|
|
||
|
def test_groupby_mixed_type_columns():
    """``first`` and ``sum`` work when column labels mix strings and integers."""
    # GH 13432, unorderable types in py3
    frame = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    for reducer in ("first", "sum"):
        result = getattr(frame.groupby("A"), reducer)()
        tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    """Median of an all-NaN Series built from a 2-D array's column slice is all NaN."""
    all_nan = np.full((100, 100), np.nan)
    column = Series(all_nan[:, 0])
    labels = np.tile(range(10), 10)

    medians = column.groupby(labels).agg(Series.median)
    assert medians.isna().all()
|
||
|
|
||
|
|
||
|
def test_series_grouper_noncontig_index():
    """Aggregating with a function that walks each group's index must not crash."""
    full_index = Index(tm.rands_array(10, 100))

    # every second label of the random index
    values = Series(np.random.randn(50), index=full_index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    def count_distinct_ids(x):
        return len(set(map(id, x.index)))

    grouped.agg(count_distinct_ids)
|
||
|
|
||
|
|
||
|
def test_convert_objects_leave_decimal_alone():
    """Aggregations returning ``Decimal`` keep object dtype and Decimal values."""
    ser = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert len(x.values.base) > 0
        return Decimal(str(x.mean()))

    grouped = ser.groupby(labels)

    for converter in (convert_fast, convert_force_pure):
        result = grouped.agg(converter)
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)
|
||
|
|
||
|
|
||
|
def test_groupby_dtype_inference_empty():
    """Grouping an empty frame keeps the key and value dtypes."""
    # GH 6733
    frame = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert frame["x"].dtype == np.float64

    result = frame.groupby("x").first()

    empty_keys = Index([], name="x", dtype=np.float64)
    empty_values = Series([], index=empty_keys, dtype="int64")
    expected = DataFrame({"range": empty_values})
    tm.assert_frame_equal(result, expected, by_blocks=True)
|
||
|
|
||
|
|
||
|
def test_groupby_list_infer_array_like(df):
    """Check how a plain list passed to ``groupby`` is interpreted.

    A list built from column "A" groups like the column itself, a list one
    element short raises ``KeyError``, and a list of two column labels groups
    like the two corresponding Series.
    """
    result = df.groupby(list(df["A"])).mean()
    expected = df.groupby(df["A"]).mean()
    tm.assert_frame_equal(result, expected, check_names=False)

    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df["A"][:-1]))

    # pathological case of ambiguity
    df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)})

    result = df.groupby(["foo", "bar"]).mean()
    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
    # The two results were previously computed but never compared.
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_keys_same_size_as_index():
|
||
|
# GH 11185
|
||
|
freq = "s"
|
||
|
index = pd.date_range(
|
||
|
start=pd.Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
|
||
|
)
|
||
|
df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
|
||
|
result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean()
|
||
|
expected = df.set_index([df.index, "metric"])
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_one_row():
    """Grouping by a missing column raises ``KeyError`` for 1- and 2-row frames."""
    # GH 11741
    msg = r"^'Z'$"
    for n_rows in (1, 2):
        frame = pd.DataFrame(np.random.randn(n_rows, 4), columns=list("ABCD"))
        with pytest.raises(KeyError, match=msg):
            frame.groupby("Z")
|
||
|
|
||
|
|
||
|
def test_groupby_nat_exclude():
    """Missing datetime keys are left out of groups, indices and ``get_group``."""
    # GH 6992
    jan = pd.Timestamp("2013-01-01")
    feb = pd.Timestamp("2013-02-01")
    df = pd.DataFrame(
        {
            "values": np.random.randn(8),
            "dt": [np.nan, jan, np.nan, feb, np.nan, feb, np.nan, jan],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    grouped = df.groupby("dt")

    expected_labels = [pd.Index([1, 7]), pd.Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for key, labels in zip(keys, expected_labels):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[key], labels)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected_positions = {
        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
    }
    for key in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[key], expected_positions[key])

    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]])
    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]])

    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    # frame whose candidate key columns hold nothing but missing values
    nan_df = DataFrame(
        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
    )
    assert nan_df["nan"].dtype == "float64"
    assert nan_df["nat"].dtype == "datetime64[ns]"

    for column in ["nan", "nat"]:
        grouped = nan_df.groupby(column)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            grouped.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            grouped.get_group(pd.NaT)
|
||
|
|
||
|
|
||
|
def test_groupby_2d_malformed():
|
||
|
d = DataFrame(index=range(2))
|
||
|
d["group"] = ["g1", "g2"]
|
||
|
d["zeros"] = [0, 0]
|
||
|
d["ones"] = [1, 1]
|
||
|
d["label"] = ["l1", "l2"]
|
||
|
tmp = d.groupby(["group"]).mean()
|
||
|
res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
|
||
|
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
|
||
|
tm.assert_numpy_array_equal(tmp.values, res_values)
|
||
|
|
||
|
|
||
|
def test_int32_overflow():
    """Four-key groupby yields the same number of groups in either key order."""
    unique_keys = np.arange(25000)
    repeated_keys = np.concatenate(
        (np.arange(10000), np.arange(10000), np.arange(5000))
    )
    frame = DataFrame(
        {
            "A": unique_keys,
            "B": repeated_keys,
            "C": unique_keys,
            "D": repeated_keys,
            "E": np.random.randn(25000),
        }
    )

    forward = frame.groupby(["A", "B", "C", "D"]).sum()
    backward = frame.groupby(["D", "C", "B", "A"]).sum()
    assert len(forward) == len(backward)
|
||
|
|
||
|
|
||
|
def test_groupby_sort_multi():
    """Check the index order produced by multi-key groupby with ``sort=True``."""
    df = DataFrame(
        {
            "a": ["foo", "bar", "baz"],
            "b": [3, 2, 1],
            "c": [0, 1, 2],
            "d": np.random.randn(3),
        }
    )

    def row_tuples(frame, keys):
        # key columns of each row as a tuple, collected in an object array
        return com.asarray_tuplesafe([tuple(row) for row in frame[keys].values])

    keys = ["a", "b", "c"]
    tups = row_tuples(df, keys)
    result = df.groupby(keys, sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    keys = ["c", "a", "b"]
    tups = row_tuples(df, keys)
    result = df.groupby(keys, sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    keys = ["b", "c", "a"]
    tups = row_tuples(df, keys)
    result = df.groupby(keys, sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    df = DataFrame(
        {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)}
    )
    result = df.groupby(["a", "b"])["d"].sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # regroup by explicit key tuples and compare group by group
        expected = f(df.groupby(row_tuples(df, keys))[field])
        for key, value in expected.items():
            assert result[key] == value

    _check_groupby(df, result, ["a", "b"], "d")
|
||
|
|
||
|
|
||
|
def test_dont_clobber_name_column():
|
||
|
df = DataFrame(
|
||
|
{"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
|
||
|
)
|
||
|
|
||
|
result = df.groupby("key").apply(lambda x: x)
|
||
|
tm.assert_frame_equal(result, df)
|
||
|
|
||
|
|
||
|
def test_skip_group_keys():
    """With ``group_keys=False``, ``apply`` equals concatenating per-group results."""
    tsf = tm.makeTimeDataFrame()

    # DataFrame case: three smallest rows by "A" within each month
    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by="A")[:3])

    expected = pd.concat([group.sort_values(by="A")[:3] for _, group in grouped])
    tm.assert_frame_equal(result, expected)

    # Series case: three smallest values within each month
    grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    expected = pd.concat([group.sort_values()[:3] for _, group in grouped])
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_no_nonsense_name(float_frame):
    """Aggregating an unnamed Series grouped by a named one leaves the name None."""
    # GH #995
    unnamed = float_frame["C"].copy()
    unnamed.name = None

    summed = unnamed.groupby(float_frame["A"]).agg(np.sum)
    assert summed.name is None
|
||
|
|
||
|
|
||
|
def test_multifunc_sum_bug():
    """A dict aggregation mixing "sum" and "size" keeps the float sum as float64."""
    # GH #1065
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame["test"] = 0
    frame["fl"] = [1.3, 1.5, 1.6]

    aggregated = frame.groupby("test").agg({"fl": "sum", 2: "size"})
    assert aggregated["fl"].dtype == np.float64
|
||
|
|
||
|
|
||
|
def test_handle_dict_return_value(df):
    """``apply`` with a dict-returning function matches the Series-returning one."""

    def as_dict(group):
        return {"max": group.max(), "min": group.min()}

    def as_series(group):
        return Series({"max": group.max(), "min": group.min()})

    result = df.groupby("A")["C"].apply(as_dict)
    expected = df.groupby("A")["C"].apply(as_series)

    assert isinstance(result, Series)
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper):
    """Every group handed to a user function carries a non-None ``name``."""

    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    # ... on the whole frame
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({"C": freduce, "D": freduce})
    grouped.transform(f)

    # ... and on a single selected column
    selected = grouped["C"]
    selected.apply(f)
    grouped["C"].aggregate(freduce)
    grouped["C"].aggregate([freduce, foo])
    grouped["C"].transform(f)
|
||
|
|
||
|
|
||
|
def test_group_name_available_in_inference_pass():
    """``apply`` calls the function once per group with ``group.name`` set."""
    # gh-15062
    frame = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})

    seen_names = []

    def record_name(group):
        seen_names.append(group.name)
        return group.copy()

    frame.groupby("a", sort=False, group_keys=False).apply(record_name)

    assert seen_names == [0, 1, 2]
|
||
|
|
||
|
|
||
|
def test_no_dummy_key_names(df):
    """Grouping by raw arrays leaves the result index level(s) unnamed."""
    # see gh-1291
    single_key = df.groupby(df["A"].values).sum()
    assert single_key.index.name is None

    two_keys = df.groupby([df["A"].values, df["B"].values]).sum()
    assert two_keys.index.names == (None, None)
|
||
|
|
||
|
|
||
|
def test_groupby_sort_multiindex_series():
    """The ``sort`` flag is honoured when grouping a Series by MultiIndex levels."""
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    levels = [[1, 2], [1, 2]]
    names = ["a", "b"]

    source_index = MultiIndex(
        levels=levels,
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=names,
    )
    mseries = Series([0, 1, 2, 3, 4, 5], index=source_index)

    result_index = MultiIndex(
        levels=levels, codes=[[0, 0, 1], [1, 0, 0]], names=names
    )
    mseries_result = Series([0, 2, 4], index=result_index)

    unsorted = mseries.groupby(level=["a", "b"], sort=False).first()
    tm.assert_series_equal(unsorted, mseries_result)

    in_order = mseries.groupby(level=["a", "b"], sort=True).first()
    tm.assert_series_equal(in_order, mseries_result.sort_index())
|
||
|
|
||
|
|
||
|
def test_groupby_reindex_inside_function():
    """Aggregate with a function that subsets each group by hour of day.

    The aggregation is run twice, once with and once without an extra label
    lookup on the group inside the function, and both results must be equal.
    """
    periods = 1000
    ind = date_range(start="2012/1/1", freq="5min", periods=periods)
    df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
        Run an aggregate func on the subset of data before ``hour``.
        """

        def _func(data):
            d = data.loc[data.index.map(lambda x: x.hour < hour)].dropna()
            if fix:
                # Lookup by the group's first label; the value is discarded
                # on purpose, only the access itself matters here.
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({"high": agg_before(11, np.max)})
    closure_good = grouped.agg({"high": agg_before(11, np.max, True)})

    tm.assert_frame_equal(closure_bad, closure_good)
|
||
|
|
||
|
|
||
|
def test_groupby_multiindex_missing_pair():
    """Summing by two index levels yields only the level pairs present in the data."""
    # GH9049
    level_names = ["group1", "group2"]
    frame = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    ).set_index(level_names)

    res = frame.groupby(level=level_names, sort=True).agg("sum")

    idx = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
    )
    exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"])

    tm.assert_frame_equal(res, exp)
|
||
|
|
||
|
|
||
|
def test_groupby_multiindex_not_lexsorted():
|
||
|
# GH 11640
|
||
|
|
||
|
# define the lexsorted version
|
||
|
lexsorted_mi = MultiIndex.from_tuples(
|
||
|
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
|
||
|
)
|
||
|
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
|
||
|
assert lexsorted_df.columns.is_lexsorted()
|
||
|
|
||
|
# define the non-lexsorted version
|
||
|
not_lexsorted_df = DataFrame(
|
||
|
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
|
||
|
)
|
||
|
not_lexsorted_df = not_lexsorted_df.pivot_table(
|
||
|
index="a", columns=["b", "c"], values="d"
|
||
|
)
|
||
|
not_lexsorted_df = not_lexsorted_df.reset_index()
|
||
|
assert not not_lexsorted_df.columns.is_lexsorted()
|
||
|
|
||
|
# compare the results
|
||
|
tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
|
||
|
|
||
|
expected = lexsorted_df.groupby("a").mean()
|
||
|
with tm.assert_produces_warning(PerformanceWarning):
|
||
|
result = not_lexsorted_df.groupby("a").mean()
|
||
|
tm.assert_frame_equal(expected, result)
|
||
|
|
||
|
# a transforming function should work regardless of sort
|
||
|
# GH 14776
|
||
|
df = DataFrame(
|
||
|
{"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
|
||
|
).set_index(["x", "y"])
|
||
|
assert not df.index.is_lexsorted()
|
||
|
|
||
|
for level in [0, 1, [0, 1]]:
|
||
|
for sort in [False, True]:
|
||
|
result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates)
|
||
|
expected = df
|
||
|
tm.assert_frame_equal(expected, result)
|
||
|
|
||
|
result = (
|
||
|
df.sort_index()
|
||
|
.groupby(level=level, sort=sort)
|
||
|
.apply(DataFrame.drop_duplicates)
|
||
|
)
|
||
|
expected = df.sort_index()
|
||
|
tm.assert_frame_equal(expected, result)
|
||
|
|
||
|
|
||
|
def test_index_label_overlaps_location():
    """``filter`` picks rows by position when index labels look like positions."""
    # checking we don't have any label/location confusion in the
    # the wake of GH5375
    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])

    def check_filter(frame):
        # DataFrame path
        actual = frame.groupby(list("ababb")).filter(lambda x: len(x) > 2)
        tm.assert_frame_equal(actual, frame.iloc[[1, 3, 4]])

        # Series path
        ser = frame[0]
        actual = ser.groupby(list("ababb")).filter(lambda x: len(x) > 2)
        tm.assert_series_equal(actual, ser.take([1, 3, 4]))

    check_filter(df)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    check_filter(df)
|
||
|
|
||
|
|
||
|
def test_transform_doesnt_clobber_ints():
    """``transform("mean")`` is the same for integer and float group keys."""
    # GH 7972
    n = 6
    x = np.arange(n)
    int_keyed = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
    float_keyed = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})

    result = int_keyed.groupby("a").transform("mean")
    expected = float_keyed.groupby("a").transform("mean")
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group handed to ``apply`` is still sorted by ``sort_column``."""
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651

    raw = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    )

    # Try sorting on different types and with different group types

    def assert_still_sorted(x):
        tm.assert_frame_equal(x, x.sort_values(by=sort_column))

    raw.sort_values(by=sort_column).groupby(group_column).apply(assert_still_sorted)
|
||
|
|
||
|
|
||
|
def test_group_shift_with_null_key():
    """``shift(-1)`` gives NaN for rows whose group key is partly missing."""
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    rows = [(i % 12, (i % 3) or np.nan, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    shifted = [
        (i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)
    ]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)
    result = g.shift(-1)

    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_group_shift_with_fill_value():
    """``shift(-1, fill_value=0)`` fills each group's vacated last row with 0."""
    # GH #24128
    n_rows = 24
    rows = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    shifted = [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)]
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)
    result = g.shift(-1, fill_value=0)[["Z"]]

    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_group_shift_lose_timezone():
    """A zero-period group shift returns the original tz-aware timestamp."""
    # GH 30134
    now_dt = pd.Timestamp.utcnow()
    frame = DataFrame({"a": [1, 1], "date": now_dt})

    first_row = frame.groupby("a").shift(0).iloc[0]
    expected = Series({"date": now_dt}, name=first_row.name)
    tm.assert_series_equal(first_row, expected)
|
||
|
|
||
|
|
||
|
def test_pivot_table_values_key_error():
    """``pivot_table`` raises ``KeyError`` when ``values`` names a missing column."""
    # This test is designed to replicate the error in issue #14938
    event_dates = pd.date_range(datetime.today(), periods=20, freq="M").tolist()
    df = pd.DataFrame({"eventDate": event_dates, "thename": range(0, 20)})

    df["year"] = df.set_index("eventDate").index.year
    df["month"] = df.set_index("eventDate").index.month

    flat = df.reset_index()
    with pytest.raises(KeyError, match="'badname'"):
        flat.pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )
|
||
|
|
||
|
|
||
|
def test_empty_dataframe_groupby():
|
||
|
# GH8093
|
||
|
df = DataFrame(columns=["A", "B", "C"])
|
||
|
|
||
|
result = df.groupby("A").sum()
|
||
|
expected = DataFrame(columns=["B", "C"], dtype=np.float64)
|
||
|
expected.index.name = "A"
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_tuple_as_grouping():
    """A tuple passed to ``groupby`` is treated as one key, not a list of keys.

    When no column is labelled by the tuple, ``KeyError`` is raised even though
    "a" and "b" exist as separate columns; when such a column exists it is used.
    """
    # https://github.com/pandas-dev/pandas/issues/18314
    df = pd.DataFrame(
        {
            ("a", "b"): [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    # Parentheses are escaped so the pattern matches the literal tuple repr
    # instead of being read as a regex capture group.
    with pytest.raises(KeyError, match=r"\('a', 'b'\)"):
        df[["a", "b", "c"]].groupby(("a", "b"))

    result = df.groupby(("a", "b"))["c"].sum()
    expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b")))
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_tuple_correct_keyerror():
    """Grouping MultiIndex columns by an absent tuple key reports that key."""
    # https://github.com/pandas-dev/pandas/issues/18798
    columns = pd.MultiIndex.from_product([[1, 2], [3, 4]])
    frame = pd.DataFrame(1, index=range(3), columns=columns)
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        frame.groupby((7, 8)).mean()
|
||
|
|
||
|
|
||
|
def test_groupby_agg_ohlc_non_first():
    """``agg`` accepts "ohlc" when it is not the first function in the list."""
    # GH 21716
    days = pd.date_range("2018-01-01", periods=2, freq="D")
    df = pd.DataFrame([[1], [1]], columns=["foo"], index=days)

    expected_columns = pd.MultiIndex.from_tuples(
        (
            ("foo", "sum", "foo"),
            ("foo", "ohlc", "open"),
            ("foo", "ohlc", "high"),
            ("foo", "ohlc", "low"),
            ("foo", "ohlc", "close"),
        )
    )
    expected = pd.DataFrame(
        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
        columns=expected_columns,
        index=pd.date_range("2018-01-01", periods=2, freq="D"),
    )

    result = df.groupby(pd.Grouper(freq="D")).agg(["sum", "ohlc"])

    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_multiindex_nat():
    """Grouping by one level still counts rows whose other level is NaT."""
    # GH 9236
    jan2 = datetime(2012, 1, 2)
    entries = [
        (pd.NaT, "a"),
        (jan2, "a"),
        (jan2, "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    mi = pd.MultiIndex.from_tuples(entries, names=["date", None])
    ser = pd.Series([3, 2, 2.5, 4], index=mi)

    result = ser.groupby(level=1).mean()
    expected = pd.Series([3.0, 2.5], index=["a", "b"])
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_empty_list_raises():
    """Grouping by a list holding one empty list raises a length ``ValueError``."""
    # GH 5289
    pairs = zip(range(10), range(10))
    frame = DataFrame(pairs, columns=["apple", "b"])
    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        frame.groupby([[]])
|
||
|
|
||
|
|
||
|
def test_groupby_multiindex_series_keys_len_equal_group_axis():
    """Level names are used as keys even when there are as many keys as rows."""
    # GH 25704
    source_index = pd.MultiIndex.from_arrays(
        [["x", "x"], ["a", "b"], ["k", "k"]], names=["first", "second", "third"]
    )
    ser = pd.Series(data=[1, 2], index=source_index)
    result = ser.groupby(["first", "third"]).sum()

    expected_index = pd.MultiIndex.from_arrays(
        [["x"], ["k"]], names=["first", "third"]
    )
    expected = pd.Series([3], index=expected_index)

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_groups_in_BaseGrouper():
    """Mixing ``pd.Grouper(level=...)`` with a level name gives the same groups."""
    # GH 26326
    # Test if DataFrame grouped with a pandas.Grouper has correct groups
    mi = pd.MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    frame = pd.DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)

    # Grouper first, plain name second
    with_grouper = frame.groupby([pd.Grouper(level="alpha"), "beta"])
    with_names = frame.groupby(["alpha", "beta"])
    assert with_grouper.groups == with_names.groups

    # plain name first, Grouper second
    with_grouper = frame.groupby(["beta", pd.Grouper(level="alpha")])
    with_names = frame.groupby(["beta", "alpha"])
    assert with_grouper.groups == with_names.groups
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
    """Column-wise groupby by name equals grouping the transpose and flipping back."""
    # GH 27614
    flat = pd.DataFrame(
        np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
    )
    flat.index.name = "y"
    flat.columns.name = "x"

    results = flat.groupby(group_name, axis=1).sum()
    expected = flat.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)

    # test on MI column
    iterables = [["bar", "baz", "foo"], ["one", "two"]]
    mi = pd.MultiIndex.from_product(iterables=iterables, names=["x", "x1"])
    nested = pd.DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi)

    results = nested.groupby(group_name, axis=1).sum()
    expected = nested.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"op, expected",
|
||
|
[
|
||
|
(
|
||
|
"shift",
|
||
|
{
|
||
|
"time": [
|
||
|
None,
|
||
|
None,
|
||
|
Timestamp("2019-01-01 12:00:00"),
|
||
|
Timestamp("2019-01-01 12:30:00"),
|
||
|
None,
|
||
|
None,
|
||
|
]
|
||
|
},
|
||
|
),
|
||
|
(
|
||
|
"bfill",
|
||
|
{
|
||
|
"time": [
|
||
|
Timestamp("2019-01-01 12:00:00"),
|
||
|
Timestamp("2019-01-01 12:30:00"),
|
||
|
Timestamp("2019-01-01 14:00:00"),
|
||
|
Timestamp("2019-01-01 14:30:00"),
|
||
|
Timestamp("2019-01-01 14:00:00"),
|
||
|
Timestamp("2019-01-01 14:30:00"),
|
||
|
]
|
||
|
},
|
||
|
),
|
||
|
(
|
||
|
"ffill",
|
||
|
{
|
||
|
"time": [
|
||
|
Timestamp("2019-01-01 12:00:00"),
|
||
|
Timestamp("2019-01-01 12:30:00"),
|
||
|
Timestamp("2019-01-01 12:00:00"),
|
||
|
Timestamp("2019-01-01 12:30:00"),
|
||
|
Timestamp("2019-01-01 14:00:00"),
|
||
|
Timestamp("2019-01-01 14:30:00"),
|
||
|
]
|
||
|
},
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
|
||
|
# GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
|
||
|
tz = tz_naive_fixture
|
||
|
data = {
|
||
|
"id": ["A", "B", "A", "B", "A", "B"],
|
||
|
"time": [
|
||
|
Timestamp("2019-01-01 12:00:00"),
|
||
|
Timestamp("2019-01-01 12:30:00"),
|
||
|
None,
|
||
|
None,
|
||
|
Timestamp("2019-01-01 14:00:00"),
|
||
|
Timestamp("2019-01-01 14:30:00"),
|
||
|
],
|
||
|
}
|
||
|
df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))
|
||
|
|
||
|
grouped = df.groupby("id")
|
||
|
result = getattr(grouped, op)()
|
||
|
expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_ffill_missing_arguments():
    """Calling groupby ``fillna()`` without any argument must raise ValueError."""
    # GH 14955
    df = pd.DataFrame({"a": [1, 2], "b": [1, 1]})
    with pytest.raises(ValueError, match="Must specify a fill"):
        df.groupby("b").fillna()
|
||
|
|
||
|
|
||
|
def test_groupby_only_none_group():
    """Transform on a frame whose only group key is ``None`` yields NaN.

    The single row has a missing grouping key, so the transformed series is
    expected to contain one NaN and keep the column name "x".
    """
    # see GH21624
    # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
    df = pd.DataFrame({"g": [None], "x": 1})
    actual = df.groupby("g")["x"].transform("sum")
    expected = pd.Series([np.nan], name="x")

    tm.assert_series_equal(actual, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_duplicate_index():
    """Grouping by level 0 of an index with duplicate labels averages them.

    The label 4.0 appears twice (values 5 and 6), so its mean is 5.5 while the
    unique labels keep their own value.
    """
    # GH#29189 the groupby call here used to raise
    ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])
    gb = ser.groupby(level=0)

    result = gb.mean()
    expected = pd.Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_bool_aggs_dup_column_labels(bool_agg_func):
    """``any``/``all`` on a grouped frame must keep duplicated column labels.

    The frame has one all-True row and two columns both named "a"; the
    aggregated result is expected to equal the input frame.
    """
    # 21668
    df = pd.DataFrame([[True, True]], columns=["a", "a"])
    # One-element key for the one-row frame, so everything is a single group.
    grp_by = df.groupby([0])
    result = getattr(grp_by, bool_agg_func)()

    expected = df
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))]
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_dup_labels_output_shape(groupby_func, idx):
    """Groupby methods must preserve shape and labels of duplicated columns.

    For each groupby method name, the method is applied to a 1x2 frame whose
    columns are the duplicated labels in ``idx``; the result must still be
    1x2 with the same columns.
    """
    # NOTE(review): groupby_func is not defined in this chunk; presumably a
    # conftest fixture yielding groupby method names -- confirm.
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip("Not applicable")

    df = pd.DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    # Some methods cannot be called without arguments; supply minimal ones.
    args = []
    if groupby_func in {"fillna", "nth"}:
        args.append(0)
    elif groupby_func == "corrwith":
        args.append(df)
    elif groupby_func == "tshift":
        # tshift is given a datetime index plus (periods, freq) arguments.
        df.index = [pd.Timestamp("today")]
        args.extend([1, "D"])

    result = getattr(grp_by, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
|
||
|
|
||
|
|
||
|
def test_groupby_crash_on_nunique(axis):
    """``nunique`` grouped by the first level of a MultiIndex must not crash.

    The frame is built with two-level column labels; when ``axis`` resolves to
    axis number 0, both the input and the expected frame are transposed so the
    grouped level lies on the axis being grouped.
    """
    # Fix following 30253
    # NOTE(review): axis is not defined in this chunk; presumably a conftest
    # fixture yielding axis identifiers accepted by _get_axis_number -- confirm.
    df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})

    axis_number = df._get_axis_number(axis)
    if not axis_number:
        df = df.T

    result = df.groupby(axis=axis_number, level=0).nunique()

    expected = pd.DataFrame({"A": [1, 2], "D": [1, 1]})
    if not axis_number:
        expected = expected.T

    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_list_level():
    """Grouping by ``level=[0]`` (a list) must work and return the same values.

    Every index label is unique, so each group holds one row and the group
    means equal the original values.
    """
    # GH 9790
    expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3))
    result = expected.groupby(level=[0]).mean()
    # Only the values matter for this regression. The dtype of the means
    # appears to differ between pandas releases (integer vs. float), so it is
    # deliberately not compared.
    tm.assert_frame_equal(result, expected, check_dtype=False)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    """The repr of ``.groups`` must honour ``display.max_seq_items``.

    With five groups, a limit of 5 shows every entry while a limit of 4
    replaces the tail with "...". Checked for grouping both by column name
    and by an array of the same values.
    """
    # GH 1135
    df = pd.DataFrame(np.random.randn(5, 1))
    df["a"] = df.index

    # Both reprs are taken inside the option context; outside it the limit
    # under test would not be in effect.
    with pd.option_context("display.max_seq_items", max_seq_items):
        result = repr(df.groupby("a").groups)
        assert result == expected

        result = repr(df.groupby(np.array(df.a)).groups)
        assert result == expected
|