from datetime import datetime import itertools import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range import pandas._testing as tm class TestDataFrameReshape: def test_pivot(self): data = { "index": ["A", "B", "C", "C", "B", "A"], "columns": ["One", "One", "One", "Two", "Two", "Two"], "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } frame = DataFrame(data) pivoted = frame.pivot(index="index", columns="columns", values="values") expected = DataFrame( { "One": {"A": 1.0, "B": 2.0, "C": 3.0}, "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, } ) expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(pivoted, expected) # name tracking assert pivoted.index.name == "index" assert pivoted.columns.name == "columns" # don't specify values pivoted = frame.pivot(index="index", columns="columns") assert pivoted.index.name == "index" assert pivoted.columns.names == (None, "columns") def test_pivot_duplicates(self): data = DataFrame( { "a": ["bar", "bar", "foo", "foo", "foo"], "b": ["one", "two", "one", "one", "two"], "c": [1.0, 2.0, 3.0, 3.0, 4.0], } ) with pytest.raises(ValueError, match="duplicate entries"): data.pivot("a", "b", "c") def test_pivot_empty(self): df = DataFrame(columns=["a", "b", "c"]) result = df.pivot("a", "b", "c") expected = DataFrame() tm.assert_frame_equal(result, expected, check_names=False) def test_pivot_integer_bug(self): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) result = df.pivot(index=1, columns=0, values=2) repr(result) tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) def test_pivot_index_none(self): # gh-3962 data = { "index": ["A", "B", "C", "C", "B", "A"], "columns": ["One", "One", "One", "Two", "Two", "Two"], "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } frame = DataFrame(data).set_index("index") result = frame.pivot(columns="columns", values="values") expected = DataFrame( { "One": {"A": 1.0, "B": 2.0, "C": 3.0}, "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, } ) expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(result, expected) # omit values result = frame.pivot(columns="columns") expected.columns = pd.MultiIndex.from_tuples( [("values", "One"), ("values", "Two")], names=[None, "columns"] ) expected.index.name = "index" tm.assert_frame_equal(result, expected, check_names=False) assert result.index.name == "index" assert result.columns.names == (None, "columns") expected.columns = expected.columns.droplevel(0) result = frame.pivot(columns="columns", values="values") expected.columns.name = "columns" tm.assert_frame_equal(result, expected) def test_stack_unstack(self, float_frame): df = float_frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) stacked = df.stack() stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() tm.assert_frame_equal(unstacked, df) tm.assert_frame_equal(unstacked_df["bar"], df) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) tm.assert_frame_equal(unstacked_cols.T, df) tm.assert_frame_equal(unstacked_cols_df["bar"].T, df) def test_stack_mixed_level(self): # GH 18310 levels = [range(3), [3, "a", "b"], [1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) result = df.stack() expected = Series(1, index=MultiIndex.from_product(levels[:2])) tm.assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) result = df.stack(1) expected = DataFrame( 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] ) tm.assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type result = df[["a", "b"]].stack(1) expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) def test_unstack_not_consolidated(self): # Gh#34708 df = pd.DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) df2 = df[["x"]] df2["y"] = df["y"] assert len(df2._mgr.blocks) == 2 res = df2.unstack() expected = df.unstack() tm.assert_series_equal(res, expected) def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = data.unstack(fill_value=-1) expected = DataFrame( {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16 ) tm.assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) expected = DataFrame( {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=float ) tm.assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: df = DataFrame( {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]} ).set_index(["x", "y", "z"]) unstacked = df.unstack(["x", "y"], fill_value=0) key = ("w", "b", "j") expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) tm.assert_series_equal(result, expected) stacked = unstacked.stack(["x", "y"]) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) result = stacked.loc[df.index] tm.assert_frame_equal(result, df) # From a series s = df["w"] result = s.unstack(["x", "y"], fill_value=0) expected = unstacked["w"] tm.assert_frame_equal(result, expected) def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] ) tm.assert_frame_equal(result, expected) # From a mixed type dataframe df["A"] = df["A"].astype(np.int16) df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) expected["A"] = expected["A"].astype(np.int16) expected["B"] = expected["B"].astype(np.float64) tm.assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list("xyz"), dtype=float) expected.columns = MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] ) tm.assert_frame_equal(result, expected) def test_unstack_fill_frame_datetime(self): # Test unstacking with date times dv = pd.date_range("2012-01-01", periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = data.unstack() expected = DataFrame( {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) expected = DataFrame( {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) def test_unstack_fill_frame_timedelta(self): # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = data.unstack() expected = DataFrame( {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) expected = DataFrame( {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) def test_unstack_fill_frame_period(self): # Test unstacking with period periods = [ Period("2012-01"), Period("2012-02"), Period("2012-03"), Period("2012-04"), ] data = Series(periods) data.index = MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) result = data.unstack() expected = DataFrame( {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]}, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) expected = DataFrame( { "a": [periods[0], periods[1], periods[3]], "b": [periods[1], periods[2], periods[1]], }, index=["x", "y", "z"], ) tm.assert_frame_equal(result, expected) def test_unstack_fill_frame_categorical(self): # Test unstacking with categorical data = pd.Series(["a", "b", "c", "a"], dtype="category") data.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) # By default missing values will be NaN result = data.unstack() expected = DataFrame( { "a": pd.Categorical(list("axa"), categories=list("abc")), "b": pd.Categorical(list("bcx"), categories=list("abc")), }, index=list("xyz"), ) tm.assert_frame_equal(result, expected) # Fill with non-category results in a ValueError msg = r"'fill_value=d' is not present in" with pytest.raises(ValueError, match=msg): data.unstack(fill_value="d") # Fill with category value replaces missing values as expected result = data.unstack(fill_value="c") expected = DataFrame( { "a": pd.Categorical(list("aca"), categories=list("abc")), "b": pd.Categorical(list("bcc"), categories=list("abc")), }, index=list("xyz"), ) tm.assert_frame_equal(result, expected) def test_unstack_tuplename_in_multiindex(self): # GH 19966 idx = pd.MultiIndex.from_product( [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")] ) df = pd.DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx) result = df.unstack(("A", "a")) expected = pd.DataFrame( [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]], columns=pd.MultiIndex.from_tuples( [ ("d", "a"), ("d", "b"), ("d", "c"), ("e", "a"), ("e", "b"), ("e", "c"), ], names=[None, ("A", "a")], ), index=pd.Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "unstack_idx, expected_values, expected_index, expected_columns", [ ( ("A", "a"), [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]], pd.MultiIndex.from_tuples( [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"] ), pd.MultiIndex.from_tuples( [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")], names=[None, ("A", "a")], ), ), ( (("A", "a"), "B"), [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]], pd.Index([3, 4], name="C"), pd.MultiIndex.from_tuples( [ ("d", "a", 1), ("d", "a", 2), ("d", "b", 1), ("d", "b", 2), ("e", "a", 1), ("e", "a", 2), ("e", "b", 1), ("e", "b", 2), ], names=[None, ("A", "a"), "B"], ), ), ], ) def test_unstack_mixed_type_name_in_multiindex( self, unstack_idx, expected_values, expected_index, expected_columns ): # GH 19966 idx = pd.MultiIndex.from_product( [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"] ) df = pd.DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx) result = df.unstack(unstack_idx) expected = pd.DataFrame( expected_values, columns=expected_columns, index=expected_index, ) tm.assert_frame_equal(result, expected) def test_unstack_preserve_dtypes(self): # Checks fix for #11847 df = pd.DataFrame( dict( state=["IL", "MI", "NC"], index=["a", "b", "c"], some_categories=pd.Series(["a", "b", "c"]).astype("category"), A=np.random.rand(3), B=1, C="foo", D=pd.Timestamp("20010102"), E=pd.Series([1.0, 50.0, 100.0]).astype("float32"), F=pd.Series([3.0, 4.0, 5.0]).astype("float64"), G=False, H=pd.Series([1, 200, 923442], dtype="int8"), ) ) def unstack_and_compare(df, column_name): unstacked1 = df.unstack([column_name]) unstacked2 = df.unstack(column_name) tm.assert_frame_equal(unstacked1, unstacked2) df1 = df.set_index(["state", "index"]) unstack_and_compare(df1, "index") df1 = df.set_index(["state", "some_categories"]) unstack_and_compare(df1, "some_categories") df1 = df.set_index(["F", "C"]) unstack_and_compare(df1, "F") df1 = df.set_index(["G", "B", "state"]) unstack_and_compare(df1, "B") df1 = df.set_index(["E", "A"]) unstack_and_compare(df1, "E") df1 = df.set_index(["state", "index"]) s = df1["A"] unstack_and_compare(s, "index") def test_stack_ints(self): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame(np.random.randn(30, 27), columns=columns) tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) tm.assert_frame_equal( df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1) ) df_named = df.copy() return_value = df_named.columns.set_names(range(3), inplace=True) assert return_value is None tm.assert_frame_equal( df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) ) def test_stack_mixed_levels(self): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short"), ], names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) animal_hair_stacked = df.stack(level=["animal", "hair_length"]) exp_hair_stacked = df.stack(level=["exp", "hair_length"]) # GH #8584: Need to check that stacking works when a number # is passed that is both a level name and in the range of # the level numbers df2 = df.copy() df2.columns.names = ["exp", "animal", 1] tm.assert_frame_equal( df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False ) tm.assert_frame_equal( df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False ) # When mixed types are passed and the ints are not level # names, raise msg = ( "level should contain all level names or all level numbers, not " "a mixture of the two" ) with pytest.raises(ValueError, match=msg): df2.stack(level=["animal", 0]) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth df3 = df.copy() df3.columns.names = ["exp", "animal", 0] tm.assert_frame_equal( df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False ) def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short"), ], names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) exp_animal_stacked = df.stack(level=["exp", "animal"]) animal_hair_stacked = df.stack(level=["animal", "hair_length"]) exp_hair_stacked = df.stack(level=["exp", "hair_length"]) df2 = df.copy() df2.columns.names = [0, 1, 2] tm.assert_frame_equal( df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False ) tm.assert_frame_equal( df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False ) tm.assert_frame_equal( df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False ) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] tm.assert_frame_equal( df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False ) tm.assert_frame_equal( df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False ) tm.assert_frame_equal( df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False ) def test_unstack_bool(self): df = DataFrame( [False, False], index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), columns=["col"], ) rs = df.unstack() xp = DataFrame( np.array([[False, np.nan], [np.nan, False]], dtype=object), index=["a", "b"], columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), ) tm.assert_frame_equal(rs, xp) def test_unstack_level_binding(self): # GH9856 mi = pd.MultiIndex( levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], names=["first", "second", "third"], ) s = pd.Series(0, index=mi) result = s.unstack([1, 2]).stack(0) expected_mi = pd.MultiIndex( levels=[["foo", "bar"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=["first", "second"], ) expected = pd.DataFrame( np.array( [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 ), index=expected_mi, columns=pd.Index(["a", "b"], name="third"), ) tm.assert_frame_equal(result, expected) def test_unstack_to_series(self, float_frame): # check reversibility data = float_frame.unstack() assert isinstance(data, Series) undo = data.unstack().T tm.assert_frame_equal(undo, float_frame) # check NA handling data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) data.index = Index(["a", "b", "c"]) result = data.unstack() midx = MultiIndex( levels=[["x", "y"], ["a", "b", "c"]], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], ) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) tm.assert_series_equal(result, expected) # check composability of unstack old_data = data.copy() for _ in range(4): data = data.unstack() tm.assert_frame_equal(old_data, data) def test_unstack_dtypes(self): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] df = DataFrame(rows, columns=list("ABCD")) result = df.dtypes expected = Series([np.dtype("int64")] * 4, index=list("ABCD")) tm.assert_series_equal(result, expected) # single dtype df2 = df.set_index(["A", "B"]) df3 = df2.unstack("B") result = df3.dtypes expected = Series( [np.dtype("int64")] * 4, index=pd.MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), ) tm.assert_series_equal(result, expected) # mixed df2 = df.set_index(["A", "B"]) df2["C"] = 3.0 df3 = df2.unstack("B") result = df3.dtypes expected = Series( [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2, index=pd.MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), ) tm.assert_series_equal(result, expected) df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes expected = Series( [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, index=pd.MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), ) tm.assert_series_equal(result, expected) # GH7405 for c, d in ( (np.zeros(5), np.zeros(5)), (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")), ): df = DataFrame( { "A": ["a"] * 5, "C": c, "D": d, "B": pd.date_range("2012-01-01", periods=5), } ) right = df.iloc[:3].copy(deep=True) df = df.set_index(["A", "B"]) df["D"] = df["D"].astype("int64") left = df.iloc[:3].unstack(0) right = right.set_index(["A", "B"]).unstack(0) right[("D", "a")] = right[("D", "a")].astype("int64") assert left.shape == (3, 2) tm.assert_frame_equal(left, right) def test_unstack_non_unique_index_names(self): idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) msg = "The name c1 occurs multiple times, use a level number" with pytest.raises(ValueError, match=msg): df.unstack("c1") with pytest.raises(ValueError, match=msg): df.T.stack("c1") def test_unstack_unused_levels(self): # GH 17845: unused codes in index make unstack() cast int to float idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1] df = pd.DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) assert (result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels levels = [[0, 1, 7], [0, 1, 2, 3]] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] idx = pd.MultiIndex(levels, codes) block = np.arange(4).reshape(2, 2) df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) result = df.unstack() expected = pd.DataFrame( np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx ) tm.assert_frame_equal(result, expected) assert (result.columns.levels[1] == idx.levels[1]).all() # With mixed dtype and NaN levels = [["a", 2, "c"], [1, 3, 5, 7]] codes = [[0, -1, 1, 1], [0, 2, -1, 2]] idx = pd.MultiIndex(levels, codes) data = np.arange(8) df = pd.DataFrame(data.reshape(4, 2), index=idx) cases = ( (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]), (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]), ) for level, idces, col_level, idx_level in cases: result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data cols = pd.MultiIndex.from_product([[0, 1], col_level]) expected = pd.DataFrame( exp_data.reshape(3, 6), index=idx_level, columns=cols ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("cols", [["A", "C"], slice(None)]) def test_unstack_unused_level(self, cols): # GH 18562 : unused codes on the unstacked level df = pd.DataFrame( [[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"] ) ind = df.set_index(["A", "B", "C"], drop=False) selection = ind.loc[(slice(None), slice(None), "I"), cols] result = selection.unstack() expected = ind.iloc[[0]][cols] expected.columns = MultiIndex.from_product( [expected.columns, ["I"]], names=[None, "C"] ) expected.index = expected.index.droplevel("C") tm.assert_frame_equal(result, expected) def test_unstack_long_index(self): # PH 32624: Error when using a lot of indices to unstack. # The error occurred only, if a lot of indices are used. df = pd.DataFrame( [[1]], columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]), index=pd.MultiIndex.from_tuples( [[0, 0, 1, 0, 0, 0, 1]], names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"], ), ) result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"]) expected = pd.DataFrame( [[1]], columns=pd.MultiIndex.from_tuples( [[0, 0, 1, 0, 0, 0, 1]], names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"], ), index=pd.Index([0], name="i1"), ) tm.assert_frame_equal(result, expected) def test_unstack_multi_level_cols(self): # PH 24729: Unstack a df with multi level columns df = pd.DataFrame( [[0.0, 0.0], [0.0, 0.0]], columns=pd.MultiIndex.from_tuples( [["B", "C"], ["B", "D"]], names=["c1", "c2"] ), index=pd.MultiIndex.from_tuples( [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"], ), ) assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"] def test_unstack_multi_level_rows_and_cols(self): # PH 28306: Unstack df with multi level cols and rows df = pd.DataFrame( [[1, 2], [3, 4], [-1, -2], [-3, -4]], columns=pd.MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]), index=pd.MultiIndex.from_tuples( [ ["m1", "P3", 222], ["m1", "A5", 111], ["m2", "P3", 222], ["m2", "A5", 111], ], names=["i1", "i2", "i3"], ), ) result = df.unstack(["i3", "i2"]) expected = df.unstack(["i3"]).unstack(["i2"]) tm.assert_frame_equal(result, expected) def test_unstack_nan_index(self): # GH7466 def cast(val): val_str = "" if val != val else val return f"{val_str:1}" def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(map(cast, right)) assert left == right df = DataFrame( { "jim": ["a", "b", np.nan, "d"], "joe": ["w", "x", "y", "z"], "jolie": ["a.w", "b.x", " .y", "d.z"], } ) left = df.set_index(["jim", "joe"]).unstack()["jolie"] right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf["jolie"]) df = DataFrame( { "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 + ["e"] * 2 + ["b"] * 5, "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 + [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2, "3rd": [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51, ], } ) df["4th"], df["5th"] = ( df.apply(lambda r: ".".join(map(cast, r)), axis=1), df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ["4th", "5th"]: verify(udf[col]) # GH7403 df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [ [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], ] vals = list(map(list, zip(*vals))) idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) df.iloc[2, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = pd.DataFrame( {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)} ) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH7401 df = pd.DataFrame( { "A": list("aaaaabbbbb"), "B": (date_range("2012-01-01", periods=5).tolist() * 2), "C": np.arange(10), } ) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) idx = Index(["a", "b"], name="A") cols = MultiIndex( levels=[["C"], date_range("2012-01-01", periods=5)], codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, "B"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], ["U", 0.0, np.nan, 680585148], ["Pb", 7.07e-06, np.nan, 680585148], ["Sn", 2.3614e-05, 0.0133, 680607017], ["Ag", 0.0, 0.0133, 680607017], ["Hg", -0.00015, 0.0133, 680607017], ] df = DataFrame( vals, columns=["agent", "change", "dosage", "s_id"], index=[17263, 17264, 17265, 17266, 17267, 17268], ) left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() vals = [ [np.nan, np.nan, 7.07e-06, np.nan, 0.0], [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], ] idx = MultiIndex( levels=[[680585148, 680607017], [0.0133]], codes=[[0, 1], [-1, 0]], names=["s_id", "dosage"], ) cols = MultiIndex( levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, "agent"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame( { "1st": [1, 2, 1, 2, 1, 2], "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), "jim": 100 + np.arange(6), "joe": (np.random.randn(6) * 10).round(2), } ) df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) for col in ["jim", "joe"]: for _, r in df.iterrows(): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key] def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected) def test_stack_partial_multiIndex(self): # GH 8844 def _test_stack_with_multiindex(multiindex): df = DataFrame( np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) for level in (-1, 0, 1, [0, 1], [1, 0]): result = df.stack(level=level, dropna=False) if isinstance(level, int): # Stacking a single level should not make any all-NaN rows, # so df.stack(level=level, dropna=False) should be the same # as df.stack(level=level, dropna=True). expected = df.stack(level=level, dropna=True) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) df.columns = MultiIndex.from_tuples( df.columns.to_numpy(), names=df.columns.names ) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], ) for multiindex_columns in ( [0, 1, 2, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 2], [1, 2, 3], [2, 3, 4], [0, 1], [0, 2], [0, 3], [0], [2], [4], ): _test_stack_with_multiindex(full_multiindex[multiindex_columns]) if len(multiindex_columns) > 1: multiindex_columns.reverse() _test_stack_with_multiindex(full_multiindex[multiindex_columns]) df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) result = df.stack(dropna=False) expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( levels=[[0, 1], ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), columns=Index(["B", "C"], name="Upper"), dtype=df.dtypes[0], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) def test_stack_preserve_categorical_dtype(self, ordered, labels): # GH13854 cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) tm.assert_series_equal(result, expected) def test_stack_preserve_categorical_dtype_values(self): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) df = pd.DataFrame({"A": cat, "B": cat}) result = df.stack() index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) expected = pd.Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "index, columns", [ ([0, 0, 1, 1], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), ([0, 0, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), ([0, 1, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), ], ) def test_stack_multi_columns_non_unique_index(self, index, columns): # GH-28301 df = pd.DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack() new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy()) expected = pd.DataFrame( stacked.to_numpy(), index=new_index, columns=stacked.columns ) tm.assert_frame_equal(stacked, expected) stacked_codes = np.asarray(stacked.index.codes) expected_codes = np.asarray(new_index.codes) tm.assert_numpy_array_equal(stacked_codes, expected_codes) @pytest.mark.parametrize("level", [0, 1]) def test_unstack_mixed_extension_types(self, level): index = pd.MultiIndex.from_tuples( [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"] ) df = pd.DataFrame( { "A": pd.core.arrays.integer_array([0, 1, None]), "B": pd.Categorical(["a", "a", "b"]), }, index=index, ) result = df.unstack(level=level) expected = df.astype(object).unstack(level=level) expected_dtypes = pd.Series( [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns ) tm.assert_series_equal(result.dtypes, expected_dtypes) tm.assert_frame_equal(result.astype(object), expected) @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) df.columns.name = "foo" expected = pd.DataFrame( [[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples( [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"] ), ) expected.index.name = "bar" result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) tm.assert_frame_equal(result, expected) def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. data = pd.Series(["a", "b", "c", "a"], dtype="object") data.index = pd.MultiIndex.from_tuples( [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) # By default missing values will be NaN result = data.unstack() expected = pd.DataFrame( {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = pd.DataFrame( {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") ) tm.assert_frame_equal(result, expected) def test_unstack_timezone_aware_values(): # GH 18338 df = pd.DataFrame( { "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")], "a": ["a"], "b": ["b"], "c": ["c"], }, columns=["timestamp", "a", "b", "c"], ) result = df.set_index(["a", "b"]).unstack() expected = pd.DataFrame( [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]], index=pd.Index(["a"], name="a"), columns=pd.MultiIndex( levels=[["timestamp", "c"], ["b"]], codes=[[0, 1], [0, 0]], names=[None, "b"], ), ) tm.assert_frame_equal(result, expected) def test_stack_timezone_aware_values(): # GH 19420 ts = pd.date_range( freq="D", start="20180101", end="20180103", tz="America/New_York" ) df = pd.DataFrame({"A": ts}, index=["a", "b", "c"]) result = df.stack() expected = pd.Series( ts, index=pd.MultiIndex( levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]] ), ) tm.assert_series_equal(result, expected) def test_unstacking_multi_index_df(): # see gh-30740 df = DataFrame( { "name": ["Alice", "Bob"], "score": [9.5, 8], "employed": [False, True], "kids": [0, 0], "gender": ["female", "male"], } ) df = df.set_index(["name", "employed", "kids", "gender"]) df = df.unstack(["gender"], fill_value=0) expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0) result = df.unstack(["employed", "kids"], fill_value=0) expected = DataFrame( [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]], index=Index(["Alice", "Bob"], name="name"), columns=MultiIndex.from_tuples( [ ("score", "female", False, 0), ("score", "female", True, 0), ("score", "male", False, 0), ("score", "male", True, 0), ], names=[None, "gender", "employed", "kids"], ), ) tm.assert_frame_equal(result, expected) def test_stack_positional_level_duplicate_column_names(): # https://github.com/pandas-dev/pandas/issues/36353 columns = pd.MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"]) df = pd.DataFrame([[1, 1, 1, 1]], columns=columns) result = df.stack(0) new_columns = pd.Index(["y", "z"], name="a") new_index = pd.MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) expected = pd.DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) tm.assert_frame_equal(result, expected)