import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna import pandas._testing as tm def test_first_last_nth(df): # tests for first / last / nth grouped = df.groupby("A") first = grouped.first() expected = df.loc[[1, 0], ["B", "C", "D"]] expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() tm.assert_frame_equal(first, expected) nth = grouped.nth(0) tm.assert_frame_equal(nth, expected) last = grouped.last() expected = df.loc[[5, 7], ["B", "C", "D"]] expected.index = Index(["bar", "foo"], name="A") tm.assert_frame_equal(last, expected) nth = grouped.nth(-1) tm.assert_frame_equal(nth, expected) nth = grouped.nth(1) expected = df.loc[[2, 3], ["B", "C", "D"]].copy() expected.index = Index(["foo", "bar"], name="A") expected = expected.sort_index() tm.assert_frame_equal(nth, expected) # it works! grouped["B"].first() grouped["B"].last() grouped["B"].nth(0) df.loc[df["A"] == "foo", "B"] = np.nan assert isna(grouped["B"].first()["foo"]) assert isna(grouped["B"].last()["foo"]) assert isna(grouped["B"].nth(0)["foo"]) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") result = g.first() expected = df.iloc[[1, 2]].set_index("A") tm.assert_frame_equal(result, expected) expected = df.iloc[[1, 2]].set_index("A") result = g.nth(0, dropna="any") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", ["first", "last"]) def test_first_last_with_na_object(method, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( "a" ) result = getattr(groups, method)() if method == "first": values = [1, 3] else: values = [2, 3] values = np.array(values, dtype=result["b"].dtype) idx = pd.Index([1, 2], name="a") expected = pd.DataFrame({"b": values}, index=idx) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("index", [0, -1]) def test_nth_with_na_object(index, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( "a" ) result = groups.nth(index) if index == 0: values = [1, 3] else: values = [2, nulls_fixture] values = np.array(values, dtype=result["b"].dtype) idx = pd.Index([1, 2], name="a") expected = pd.DataFrame({"b": values}, index=idx) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", ["first", "last"]) def test_first_last_with_None(method): # https://github.com/pandas-dev/pandas/issues/32800 # None should be preserved as object dtype df = pd.DataFrame.from_dict({"id": ["a"], "value": [None]}) groups = df.groupby("id", as_index=False) result = getattr(groups, method)() tm.assert_frame_equal(result, df) def test_first_last_nth_dtypes(df_mixed_floats): df = df_mixed_floats.copy() df["E"] = True df["F"] = 1 # tests for first / last / nth grouped = df.groupby("A") first = grouped.first() expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() tm.assert_frame_equal(first, expected) last = grouped.last() expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() tm.assert_frame_equal(last, expected) nth = grouped.nth(1) expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]] expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() tm.assert_frame_equal(nth, expected) # GH 2763, first/last shifting dtypes idx = list(range(10)) idx.append(9) s = Series(data=range(11), index=idx, name="IntCol") assert s.dtype == "int64" f = s.groupby(level=0).first() assert f.dtype == "int64" def test_first_last_nth_nan_dtype(): # GH 33591 df = pd.DataFrame({"data": ["A"], "nans": pd.Series([np.nan], dtype=object)}) grouped = df.groupby("data") expected = df.set_index("data").nans tm.assert_series_equal(grouped.nans.first(), expected) tm.assert_series_equal(grouped.nans.last(), expected) tm.assert_series_equal(grouped.nans.nth(-1), expected) tm.assert_series_equal(grouped.nans.nth(0), expected) def test_first_strings_timestamps(): # GH 11244 test = pd.DataFrame( { pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], "name": ["e", "e"], "aaaa": ["f", "g"], } ) result = test.groupby("name").first() expected = DataFrame( [["a", "c", "f"]], columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), index=Index(["e"], name="name"), ) tm.assert_frame_equal(result, expected) def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A")) tm.assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A")) tm.assert_frame_equal(g.nth(2), df.loc[[]].set_index("A")) tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A")) tm.assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A")) tm.assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A")) tm.assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]]) tm.assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]]) tm.assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A")) exp = df.set_index("A") tm.assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]]) tm.assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]]) exp["B"] = np.nan tm.assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]]) tm.assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]]) # out of bounds, regression from 0.13.1 # GH 6621 df = DataFrame( { "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, "two": { 0: 1.5456590000000001, 1: -0.070345000000000005, 2: -2.4004539999999999, 3: 0.46206000000000003, 4: 0.52350799999999997, }, "one": { 0: 0.56573799999999996, 1: -0.9742360000000001, 2: 1.033801, 3: -0.78543499999999999, 4: 0.70422799999999997, }, } ).set_index(["color", "food"]) result = df.groupby(level=0, as_index=False).nth(2) expected = df.iloc[[-1]] tm.assert_frame_equal(result, expected) result = df.groupby(level=0, as_index=False).nth(3) expected = df.loc[[]] tm.assert_frame_equal(result, expected) # GH 7559 # from the vbench df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64") s = df[1] g = df[0] expected = s.groupby(g).first() expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) tm.assert_series_equal(expected2, expected, check_names=False) assert expected.name == 1 assert expected2.name == 1 # validate first v = s[g == 1].iloc[0] assert expected.iloc[0] == v assert expected2.iloc[0] == v # this is NOT the same as .first (as sorted is default!) # as it keeps the order in the series (and not the group order) # related GH 7287 expected = s.groupby(g, sort=False).first() result = s.groupby(g, sort=False).nth(0, dropna="all") tm.assert_series_equal(result, expected) with pytest.raises(ValueError, match="For a DataFrame groupby"): s.groupby(g, sort=False).nth(0, dropna=True) # doc example df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") result = g.B.nth(0, dropna="all") expected = g.B.first() tm.assert_series_equal(result, expected) # test multiple nth values df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) g = df.groupby("A") tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A")) tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A")) tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A")) tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A")) tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) tm.assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A")) tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A")) business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") df = DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, fourth and last two business days for each month key = [df.index.year, df.index.month] result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) expected_dates = pd.to_datetime( [ "2014/4/1", "2014/4/4", "2014/4/29", "2014/4/30", "2014/5/1", "2014/5/6", "2014/5/29", "2014/5/30", "2014/6/2", "2014/6/5", "2014/6/27", "2014/6/30", ] ) expected = DataFrame(1, columns=["a", "b"], index=expected_dates) tm.assert_frame_equal(result, expected) def test_nth_multi_index(three_group): # PR 9090, related to issue 8979 # test nth on MultiIndex, should match .first() grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = grouped.first() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data, expected_first, expected_last", [ ( { "id": ["A"], "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), "foo": [1], }, { "id": ["A"], "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), "foo": [1], }, { "id": ["A"], "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), "foo": [1], }, ), ( { "id": ["A", "B", "A"], "time": [ Timestamp("2012-01-01 13:00:00", tz="America/New_York"), Timestamp("2012-02-01 14:00:00", tz="US/Central"), Timestamp("2012-03-01 12:00:00", tz="Europe/London"), ], "foo": [1, 2, 3], }, { "id": ["A", "B"], "time": [ Timestamp("2012-01-01 13:00:00", tz="America/New_York"), Timestamp("2012-02-01 14:00:00", tz="US/Central"), ], "foo": [1, 2], }, { "id": ["A", "B"], "time": [ Timestamp("2012-03-01 12:00:00", tz="Europe/London"), Timestamp("2012-02-01 14:00:00", tz="US/Central"), ], "foo": [3, 2], }, ), ], ) def test_first_last_tz(data, expected_first, expected_last): # GH15884 # Test that the timezone is retained when calling first # or last on groupby with as_index=False df = DataFrame(data) result = df.groupby("id", as_index=False).first() expected = DataFrame(expected_first) cols = ["id", "time", "foo"] tm.assert_frame_equal(result[cols], expected[cols]) result = df.groupby("id", as_index=False)["time"].first() tm.assert_frame_equal(result, expected[["id", "time"]]) result = df.groupby("id", as_index=False).last() expected = DataFrame(expected_last) cols = ["id", "time", "foo"] tm.assert_frame_equal(result[cols], expected[cols]) result = df.groupby("id", as_index=False)["time"].last() tm.assert_frame_equal(result, expected[["id", "time"]]) @pytest.mark.parametrize( "method, ts, alpha", [ ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], ], ) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 category_string = pd.Series(list("abc")).astype("category") df = pd.DataFrame( { "group": [1, 1, 2], "category_string": category_string, "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), } ) result = getattr(df.groupby("group"), method)() expected = pd.DataFrame( { "category_string": pd.Categorical( [alpha, "c"], dtype=category_string.dtype ), "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], }, index=pd.Index([1, 2], name="group"), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "values", [ pd.array([True, False], dtype="boolean"), pd.array([1, 2], dtype="Int64"), pd.to_datetime(["2020-01-01", "2020-02-01"]), pd.to_timedelta([1, 2], unit="D"), ], ) @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) def test_first_last_extension_array_keeps_dtype(values, function): # https://github.com/pandas-dev/pandas/issues/33071 # https://github.com/pandas-dev/pandas/issues/32194 df = DataFrame({"a": [1, 2], "b": values}) grouped = df.groupby("a") idx = Index([1, 2], name="a") expected_series = Series(values, name="b", index=idx) expected_frame = DataFrame({"b": values}, index=idx) result_series = getattr(grouped["b"], function)() tm.assert_series_equal(result_series, expected_series) result_frame = grouped.agg({"b": function}) tm.assert_frame_equal(result_frame, expected_frame) def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex three_group = DataFrame( { "A": [ "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo", ], "B": [ "one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one", ], "C": [ "dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny", ], } ) grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = DataFrame( {"C": ["dull", "dull", "dull", "dull"]}, index=MultiIndex.from_arrays( [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]], names=["A", "B"], ), ) tm.assert_frame_equal(result, expected) def test_groupby_head_tail(): df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) g_as = df.groupby("A", as_index=True) g_not_as = df.groupby("A", as_index=False) # as_index= False, much easier tm.assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) tm.assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) empty_not_as = DataFrame( columns=df.columns, index=pd.Index([], dtype=df.index.dtype) ) empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype) empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype) tm.assert_frame_equal(empty_not_as, g_not_as.head(0)) tm.assert_frame_equal(empty_not_as, g_not_as.tail(0)) tm.assert_frame_equal(empty_not_as, g_not_as.head(-1)) tm.assert_frame_equal(empty_not_as, g_not_as.tail(-1)) tm.assert_frame_equal(df, g_not_as.head(7)) # contains all tm.assert_frame_equal(df, g_not_as.tail(7)) # as_index=True, (used to be different) df_as = df tm.assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) tm.assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) empty_as["A"] = empty_not_as["A"].astype(df.A.dtype) empty_as["B"] = empty_not_as["B"].astype(df.B.dtype) tm.assert_frame_equal(empty_as, g_as.head(0)) tm.assert_frame_equal(empty_as, g_as.tail(0)) tm.assert_frame_equal(empty_as, g_as.head(-1)) tm.assert_frame_equal(empty_as, g_as.tail(-1)) tm.assert_frame_equal(df_as, g_as.head(7)) # contains all tm.assert_frame_equal(df_as, g_as.tail(7)) # test with selection tm.assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) tm.assert_frame_equal(g_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) tm.assert_frame_equal(g_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) tm.assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) tm.assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) tm.assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) tm.assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) tm.assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) def test_group_selection_cache(): # GH 12839 nth, head, and tail should return same result consistently df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) expected = df.iloc[[0, 2]].set_index("A") g = df.groupby("A") result1 = g.head(n=2) result2 = g.nth(0) tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, expected) g = df.groupby("A") result1 = g.tail(n=2) result2 = g.nth(0) tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, expected) g = df.groupby("A") result1 = g.nth(0) result2 = g.head(n=2) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, df) g = df.groupby("A") result1 = g.nth(0) result2 = g.tail(n=2) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, df) def test_nth_empty(): # GH 16064 df = DataFrame(index=[0], columns=["a", "b", "c"]) result = df.groupby("a").nth(10) expected = DataFrame(index=Index([], name="a"), columns=["b", "c"]) tm.assert_frame_equal(result, expected) result = df.groupby(["a", "b"]).nth(10) expected = DataFrame( index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"] ) tm.assert_frame_equal(result, expected) def test_nth_column_order(): # GH 20760 # Check that nth preserves column order df = DataFrame( [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], columns=["A", "C", "B"], ) result = df.groupby("A").nth(0) expected = DataFrame( [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A") ) tm.assert_frame_equal(result, expected) result = df.groupby("A").nth(-1, dropna="any") expected = DataFrame( [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A") ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dropna", [None, "any", "all"]) def test_nth_nan_in_grouper(dropna): # GH 26011 df = DataFrame( [[np.nan, 0, 1], ["abc", 2, 3], [np.nan, 4, 5], ["def", 6, 7], [np.nan, 8, 9]], columns=list("abc"), ) result = df.groupby("a").nth(0, dropna=dropna) expected = pd.DataFrame( [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") ) tm.assert_frame_equal(result, expected)