from collections import OrderedDict, abc, deque import datetime as dt from datetime import datetime from decimal import Decimal from io import StringIO from itertools import combinations from warnings import catch_warnings import dateutil import numpy as np from numpy.random import randn import pytest from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, concat, date_range, isna, read_csv, ) import pandas._testing as tm from pandas.core.arrays import SparseArray from pandas.core.construction import create_series_with_explicit_dtype from pandas.tests.extension.decimal import to_decimal @pytest.fixture(params=[True, False]) def sort(request): """Boolean sort keyword for concat and DataFrame.append.""" return request.param class TestConcatAppendCommon: """ Test common dtype coercion rules between concat and append. """ def setup_method(self, method): dt_data = [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ] tz_data = [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), ] td_data = [ pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days"), ] period_data = [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Period("2011-03", freq="M"), ] self.data = { "bool": [True, False, True], "int64": [1, 2, 3], "float64": [1.1, np.nan, 3.3], "category": pd.Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], "datetime64[ns]": dt_data, "datetime64[ns, US/Eastern]": tz_data, "timedelta64[ns]": td_data, "period[M]": period_data, } def _check_expected_dtype(self, obj, label): """ Check whether obj has expected dtype depending on label considering not-supported dtypes """ if isinstance(obj, pd.Index): if label == "bool": assert obj.dtype == "object" else: assert obj.dtype == label elif isinstance(obj, pd.Series): if label.startswith("period"): assert obj.dtype == "Period[M]" else: assert obj.dtype == label else: raise ValueError def test_dtypes(self): # to confirm test case covers intended dtypes for typ, vals in self.data.items(): self._check_expected_dtype(pd.Index(vals), typ) self._check_expected_dtype(pd.Series(vals), typ) def test_concatlike_same_dtypes(self): # GH 13660 for typ1, vals1 in self.data.items(): vals2 = vals1 vals3 = vals1 if typ1 == "category": exp_data = pd.Categorical(list(vals1) + list(vals2)) exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) else: exp_data = vals1 + vals2 exp_data3 = vals1 + vals2 + vals3 # ----- Index ----- # # index.append res = pd.Index(vals1).append(pd.Index(vals2)) exp = pd.Index(exp_data) tm.assert_index_equal(res, exp) # 3 elements res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) exp = pd.Index(exp_data3) tm.assert_index_equal(res, exp) # index.append name mismatch i1 = pd.Index(vals1, name="x") i2 = pd.Index(vals2, name="y") res = i1.append(i2) exp = pd.Index(exp_data) tm.assert_index_equal(res, exp) # index.append name match i1 = pd.Index(vals1, name="x") i2 = pd.Index(vals2, name="x") res = i1.append(i2) exp = pd.Index(exp_data, name="x") tm.assert_index_equal(res, exp) # cannot append non-index with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append(vals2) with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append([pd.Index(vals2), vals3]) # ----- Series ----- # # series.append res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) # concat res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements res = pd.Series(vals1).append( [pd.Series(vals2), pd.Series(vals3)], ignore_index=True ) exp = pd.Series(exp_data3) tm.assert_series_equal(res, exp) res = pd.concat( [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], ignore_index=True, ) tm.assert_series_equal(res, exp) # name mismatch s1 = pd.Series(vals1, name="x") s2 = pd.Series(vals2, name="y") res = s1.append(s2, ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) res = pd.concat([s1, s2], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # name match s1 = pd.Series(vals1, name="x") s2 = pd.Series(vals2, name="x") res = s1.append(s2, ignore_index=True) exp = pd.Series(exp_data, name="x") tm.assert_series_equal(res, exp, check_index_type=True) res = pd.concat([s1, s2], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # cannot append non-index msg = ( r"cannot concatenate object of type '.+'; " "only Series and DataFrame objs are valid" ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append([pd.Series(vals2), vals3]) with pytest.raises(TypeError, match=msg): pd.concat([pd.Series(vals1), vals2]) with pytest.raises(TypeError, match=msg): pd.concat([pd.Series(vals1), pd.Series(vals2), vals3]) def test_concatlike_dtypes_coercion(self): # GH 13660 for typ1, vals1 in self.data.items(): for typ2, vals2 in self.data.items(): vals3 = vals2 # basically infer exp_index_dtype = None exp_series_dtype = None if typ1 == typ2: # same dtype is tested in test_concatlike_same_dtypes continue elif typ1 == "category" or typ2 == "category": # TODO: suspicious continue # specify expected dtype if typ1 == "bool" and typ2 in ("int64", "float64"): # series coerces to numeric based on numpy rule # index doesn't because bool is object dtype exp_series_dtype = typ2 elif typ2 == "bool" and typ1 in ("int64", "float64"): exp_series_dtype = typ1 elif ( typ1 == "datetime64[ns, US/Eastern]" or typ2 == "datetime64[ns, US/Eastern]" or typ1 == "timedelta64[ns]" or typ2 == "timedelta64[ns]" ): exp_index_dtype = object exp_series_dtype = object exp_data = vals1 + vals2 exp_data3 = vals1 + vals2 + vals3 # ----- Index ----- # # index.append res = pd.Index(vals1).append(pd.Index(vals2)) exp = pd.Index(exp_data, dtype=exp_index_dtype) tm.assert_index_equal(res, exp) # 3 elements res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) exp = pd.Index(exp_data3, dtype=exp_index_dtype) tm.assert_index_equal(res, exp) # ----- Series ----- # # series.append res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data, dtype=exp_series_dtype) tm.assert_series_equal(res, exp, check_index_type=True) # concat res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements res = pd.Series(vals1).append( [pd.Series(vals2), pd.Series(vals3)], ignore_index=True ) exp = pd.Series(exp_data3, dtype=exp_series_dtype) tm.assert_series_equal(res, exp) res = pd.concat( [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], ignore_index=True, ) tm.assert_series_equal(res, exp) def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 # result must be Timestamp/Timedelta, not datetime.datetime/timedelta dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = pd.Index( [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ] ) res = dti.append(tdi) tm.assert_index_equal(res, exp) assert isinstance(res[0], pd.Timestamp) assert isinstance(res[-1], pd.Timedelta) dts = pd.Series(dti) tds = pd.Series(tdi) res = dts.append(tds) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta) res = pd.concat([dts, tds]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta) def test_concatlike_datetimetz(self, tz_aware_fixture): tz = tz_aware_fixture # GH 7795 dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) exp = pd.DatetimeIndex( ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) dts1 = pd.Series(dti1) dts2 = pd.Series(dti2) res = dts1.append(dts2) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([dts1, dts2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) def test_concatlike_datetimetz_short(self, tz): # GH#7795 ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) df1 = pd.DataFrame(0, index=ix1, columns=["A", "B"]) df2 = pd.DataFrame(0, index=ix2, columns=["A", "B"]) exp_idx = pd.DatetimeIndex( ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], tz=tz, ) exp = pd.DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1.append(df2), exp) tm.assert_frame_equal(pd.concat([df1, df2]), exp) def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tz = tz_aware_fixture # GH 13660 # different tz coerces to object dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) exp = pd.Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-02"), ], dtype=object, ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) dts1 = pd.Series(dti1) dts2 = pd.Series(dti2) res = dts1.append(dts2) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([dts1, dts2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # different tz dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") exp = pd.Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), pd.Timestamp("2012-01-01", tz="US/Pacific"), pd.Timestamp("2012-01-02", tz="US/Pacific"), ], dtype=object, ) res = dti1.append(dti3) # tm.assert_index_equal(res, exp) dts1 = pd.Series(dti1) dts3 = pd.Series(dti3) res = dts1.append(dts3) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([dts1, dts3]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) def test_concatlike_common_period(self): # GH 13660 pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") res = pi1.append(pi2) tm.assert_index_equal(res, exp) ps1 = pd.Series(pi1) ps2 = pd.Series(pi2) res = ps1.append(ps2) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, ps2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) def test_concatlike_common_period_diff_freq_to_object(self): # GH 13221 pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") exp = pd.Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), ], dtype=object, ) res = pi1.append(pi2) tm.assert_index_equal(res, exp) ps1 = pd.Series(pi1) ps2 = pd.Series(pi2) res = ps1.append(ps2) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, ps2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) def test_concatlike_common_period_mixed_dt_to_object(self): # GH 13221 # different datetimelike pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = pd.Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ], dtype=object, ) res = pi1.append(tdi) tm.assert_index_equal(res, exp) ps1 = pd.Series(pi1) tds = pd.Series(tdi) res = ps1.append(tds) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, tds]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # inverse exp = pd.Index( [ pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), ], dtype=object, ) res = tdi.append(pi1) tm.assert_index_equal(res, exp) ps1 = pd.Series(pi1) tds = pd.Series(tdi) res = tds.append(ps1) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) res = pd.concat([tds, ps1]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) def test_concat_categorical(self): # GH 13524 # same categories -> category s1 = pd.Series([1, 2, np.nan], dtype="category") s2 = pd.Series([2, 1, 2], dtype="category") exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # partially different categories => not-category s1 = pd.Series([3, 2], dtype="category") s2 = pd.Series([2, 1], dtype="category") exp = pd.Series([3, 2, 2, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # completely different categories (same dtype) => not-category s1 = pd.Series([10, 11, np.nan], dtype="category") s2 = pd.Series([np.nan, 1, 3, 2], dtype="category") exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 a = pd.Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) b = pd.Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) result = pd.concat([a, b], ignore_index=True) expected = pd.Series( Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) ) tm.assert_series_equal(result, expected) def test_concat_categorical_coercion(self): # GH 13524 # category + not-category => not-category s1 = pd.Series([1, 2, np.nan], dtype="category") s2 = pd.Series([2, 1, 2]) exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all values are not in category => not-category s1 = pd.Series([3, 2], dtype="category") s2 = pd.Series([2, 1]) exp = pd.Series([3, 2, 2, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) exp = pd.Series([2, 1, 3, 2]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # completely different categories => not-category s1 = pd.Series([10, 11, np.nan], dtype="category") s2 = pd.Series([1, 3, 2]) exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # different dtype => not-category s1 = pd.Series([10, 11, np.nan], dtype="category") s2 = pd.Series(["a", "b", "c"]) exp = pd.Series([10, 11, np.nan, "a", "b", "c"]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) exp = pd.Series(["a", "b", "c", 10, 11, np.nan]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # if normal series only contains NaN-likes => not-category s1 = pd.Series([10, 11], dtype="category") s2 = pd.Series([np.nan, np.nan, np.nan]) exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) def test_concat_categorical_3elem_coercion(self): # GH 13524 # mixed dtypes => not-category s1 = pd.Series([1, 2, np.nan], dtype="category") s2 = pd.Series([2, 1, 2], dtype="category") s3 = pd.Series([1, 2, 1, 2, np.nan]) exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="float") tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="float") tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) # values are all in either category => not-category s1 = pd.Series([4, 5, 6], dtype="category") s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([1, 3, 4]) exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) # values are all in either category => not-category s1 = pd.Series([4, 5, 6], dtype="category") s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([10, 11, 12]) exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) def test_concat_categorical_multi_coercion(self): # GH 13524 s1 = pd.Series([1, 3], dtype="category") s2 = pd.Series([3, 4], dtype="category") s3 = pd.Series([2, 3]) s4 = pd.Series([2, 2], dtype="category") s5 = pd.Series([1, np.nan]) s6 = pd.Series([1, 3, 2], dtype="category") # mixed dtype, values are all in categories => not-category exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) tm.assert_series_equal(res, exp) res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) tm.assert_series_equal(res, exp) exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) def test_concat_categorical_ordered(self): # GH 13524 s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) exp = pd.Series( pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) ) tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) def test_concat_categorical_coercion_nan(self): # GH 13524 # some edge cases # category + not-category => not category s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") s2 = pd.Series([np.nan, 1]) exp = pd.Series([np.nan, np.nan, np.nan, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) s1 = pd.Series([1, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="float") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # mixed dtype, all nan-likes => not-category s1 = pd.Series([np.nan, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all category nan-likes => category s1 = pd.Series([np.nan, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan], dtype="category") exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) def test_concat_categorical_empty(self): # GH 13524 s1 = pd.Series([], dtype="category") s2 = pd.Series([1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) s1 = pd.Series([], dtype="category") s2 = pd.Series([], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) s1 = pd.Series([], dtype="category") s2 = pd.Series([], dtype="object") # different dtype => not-category tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) s1 = pd.Series([], dtype="category") s2 = pd.Series([np.nan, np.nan]) # empty Series is ignored exp = pd.Series([np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) class TestAppend: def test_append(self, sort, float_frame): mixed_frame = float_frame.copy() mixed_frame["foo"] = "bar" begin_index = float_frame.index[:5] end_index = float_frame.index[5:] begin_frame = float_frame.reindex(begin_index) end_frame = float_frame.reindex(end_index) appended = begin_frame.append(end_frame) tm.assert_almost_equal(appended["A"], float_frame["A"]) del end_frame["A"] partial_appended = begin_frame.append(end_frame, sort=sort) assert "A" in partial_appended partial_appended = end_frame.append(begin_frame, sort=sort) assert "A" in partial_appended # mixed type handling appended = mixed_frame[:5].append(mixed_frame[5:]) tm.assert_frame_equal(appended, mixed_frame) # what to test here mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) # all equal except 'foo' column tm.assert_frame_equal( mixed_appended.reindex(columns=["A", "B", "C", "D"]), mixed_appended2.reindex(columns=["A", "B", "C", "D"]), ) def test_append_empty(self, float_frame): empty = DataFrame() appended = float_frame.append(empty) tm.assert_frame_equal(float_frame, appended) assert appended is not float_frame appended = empty.append(float_frame) tm.assert_frame_equal(float_frame, appended) assert appended is not float_frame def test_append_overlap_raises(self, float_frame): msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): float_frame.append(float_frame, verify_integrity=True) def test_append_new_columns(self): # see gh-6129: new columns df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) row = Series([5, 6, 7], index=["a", "b", "c"], name="z") expected = DataFrame( { "a": {"x": 1, "y": 2, "z": 5}, "b": {"x": 3, "y": 4, "z": 6}, "c": {"z": 7}, } ) result = df.append(row) tm.assert_frame_equal(result, expected) def test_append_length0_frame(self, sort): df = DataFrame(columns=["A", "B", "C"]) df3 = DataFrame(index=[0, 1], columns=["A", "B"]) df5 = df.append(df3, sort=sort) expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) tm.assert_frame_equal(df5, expected) def test_append_records(self): arr1 = np.zeros((2,), dtype=("i4,f4,a10")) arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] arr2 = np.zeros((3,), dtype=("i4,f4,a10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) df2 = DataFrame(arr2) result = df1.append(df2, ignore_index=True) expected = DataFrame(np.concatenate((arr1, arr2))) tm.assert_frame_equal(result, expected) # rewrite sort fixture, since we also want to test default of None def test_append_sorts(self, sort): df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) with tm.assert_produces_warning(None): result = df1.append(df2, sort=sort) # for None / True expected = pd.DataFrame( {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, columns=["a", "b", "c"], ) if sort is False: expected = expected[["b", "a", "c"]] tm.assert_frame_equal(result, expected) def test_append_different_columns(self, sort): df = DataFrame( { "bools": np.random.randn(10) > 0, "ints": np.random.randint(0, 10, 10), "floats": np.random.randn(10), "strings": ["foo", "bar"] * 5, } ) a = df[:5].loc[:, ["bools", "ints", "floats"]] b = df[5:].loc[:, ["strings", "ints", "floats"]] appended = a.append(b, sort=sort) assert isna(appended["strings"][0:4]).all() assert isna(appended["bools"][5:]).all() def test_append_many(self, sort, float_frame): chunks = [ float_frame[:5], float_frame[5:10], float_frame[10:15], float_frame[15:], ] result = chunks[0].append(chunks[1:]) tm.assert_frame_equal(result, float_frame) chunks[-1] = chunks[-1].copy() chunks[-1]["foo"] = "bar" result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) assert (result["foo"][15:] == "bar").all() assert result["foo"][:15].isna().all() def test_append_preserve_index_name(self): # #980 df1 = DataFrame(columns=["A", "B", "C"]) df1 = df1.set_index(["A"]) df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) df2 = df2.set_index(["A"]) result = df1.append(df2) assert result.index.name == "A" indexes_can_append = [ pd.RangeIndex(3), pd.Index([4, 5, 6]), pd.Index([4.5, 5.5, 6.5]), pd.Index(list("abc")), pd.CategoricalIndex("A B C".split()), pd.CategoricalIndex("D E F".split(), ordered=True), pd.IntervalIndex.from_breaks([7, 8, 9, 10]), pd.DatetimeIndex( [ dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 3, 7, 12), ] ), ] indexes_cannot_append_with_other = [ pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) ] all_indexes = indexes_can_append + indexes_cannot_append_with_other @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) def test_append_same_columns_type(self, index): # GH18359 # df wider than ser df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) ser_index = index[:2] ser = pd.Series([7, 8], index=ser_index, name=2) result = df.append(ser) expected = pd.DataFrame( [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index ) tm.assert_frame_equal(result, expected) # ser wider than df ser_index = index index = index[:2] df = pd.DataFrame([[1, 2], [4, 5]], columns=index) ser = pd.Series([7, 8, 9], index=ser_index, name=2) result = df.append(ser) expected = pd.DataFrame( [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], index=[0, 1, 2], columns=ser_index, ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "df_columns, series_index", combinations(indexes_can_append, r=2), ids=lambda x: type(x).__name__, ) def test_append_different_columns_types(self, df_columns, series_index): # GH18359 # See also test 'test_append_different_columns_types_raises' below # for errors raised when appending df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) ser = pd.Series([7, 8, 9], index=series_index, name=2) result = df.append(ser) idx_diff = ser.index.difference(df_columns) combined_columns = Index(df_columns.tolist()).append(idx_diff) expected = pd.DataFrame( [ [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], [4, 5, 6, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, 7, 8, 9], ], index=[0, 1, 2], columns=combined_columns, ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ ) @pytest.mark.parametrize( "index_cannot_append_with_other", indexes_cannot_append_with_other, ids=lambda x: type(x).__name__, ) def test_append_different_columns_types_raises( self, index_can_append, index_cannot_append_with_other ): # GH18359 # Dataframe.append will raise if MultiIndex appends # or is appended to a different index type # # See also test 'test_append_different_columns_types' above for # appending without raising. df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) msg = ( r"Expected tuple, got (int|long|float|str|" r"pandas._libs.interval.Interval)|" r"object of type '(int|float|Timestamp|" r"pandas._libs.interval.Interval)' has no len\(\)|" ) with pytest.raises(TypeError, match=msg): df.append(ser) df = pd.DataFrame( [[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other ) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) with pytest.raises(TypeError, match=msg): df.append(ser) def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 df1 = DataFrame( index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], columns=["start_time"], ) df2 = DataFrame( index=[4, 5], data=[ [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], ], columns=["start_time", "end_time"], ) expected = concat( [ Series( [ pd.NaT, pd.NaT, dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 4, 7, 10), ], name="end_time", ), Series( [ dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0), ], name="start_time", ), ], axis=1, sort=sort, ) result = df1.append(df2, ignore_index=True, sort=sort) if sort: expected = expected[["end_time", "start_time"]] else: expected = expected[["start_time", "end_time"]] tm.assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self, sort): df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) appended = df1.append(df2, ignore_index=True, sort=sort) assert appended["A"].dtype == "f8" assert appended["B"].dtype == "O" def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) s = Series({"date": date, "a": 1.0, "b": 2.0}) df = DataFrame(columns=["c", "d"]) result_a = df.append(s, ignore_index=True) expected = DataFrame( [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] ) # These columns get cast to object after append expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) tm.assert_frame_equal(result_a, expected) expected = DataFrame( [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] ) expected["c"] = expected["c"].astype(object) expected["d"] = expected["d"].astype(object) result_b = result_a.append(s, ignore_index=True) tm.assert_frame_equal(result_b, expected) # column order is different expected = expected[["c", "d", "date", "a", "b"]] result = df.append([s, s], ignore_index=True) tm.assert_frame_equal(result, expected) def test_append_empty_tz_frame_with_datetime64ns(self): # https://github.com/pandas-dev/pandas/issues/35460 df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df.append({"a": pd.NaT}, ignore_index=True) expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") tm.assert_frame_equal(result, expected) # also test with typed value to append df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") result = df.append( pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True ) expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") tm.assert_frame_equal(result, expected) class TestConcatenate: def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) for b in result._mgr.blocks: assert b.values.base is None # These are the same. result = concat([df, df2, df3], axis=1, copy=False) for b in result._mgr.blocks: if b.is_float: assert b.values.base is df._mgr.blocks[0].values.base elif b.is_integer: assert b.values.base is df2._mgr.blocks[0].values.base elif b.is_object: assert b.values.base is not None # Float block was consolidated. df4 = DataFrame(np.random.randn(4, 1)) result = concat([df, df2, df3, df4], axis=1, copy=False) for b in result._mgr.blocks: if b.is_float: assert b.values.base is None elif b.is_integer: assert b.values.base is df2._mgr.blocks[0].values.base elif b.is_object: assert b.values.base is not None def test_concat_with_group_keys(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randn(4, 4)) # axis=0 df = DataFrame(np.random.randn(3, 4)) df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1]) exp_index = MultiIndex.from_arrays( [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] ) expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1]) exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) tm.assert_frame_equal(result, expected) # axis=1 df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1], axis=1) expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1], axis=1) expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) tm.assert_frame_equal(result, expected) def test_concat_keys_specific_levels(self): df = DataFrame(np.random.randn(10, 4)) pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] level = ["three", "two", "one", "zero"] result = concat( pieces, axis=1, keys=["one", "two", "three"], levels=[level], names=["group_key"], ) tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) assert result.columns.names == ["group_key", None] def test_concat_dataframe_keys_bug(self, sort): t1 = DataFrame( {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} ) t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] def test_concat_series_partial_columns_names(self): # GH10698 foo = Series([1, 2], name="foo") bar = Series([1, 2]) baz = Series([4, 5]) result = concat([foo, bar, baz], axis=1) expected = DataFrame( {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] ) tm.assert_frame_equal(result, expected) result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) expected = DataFrame( {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, columns=["red", "blue", "yellow"], ) tm.assert_frame_equal(result, expected) result = concat([foo, bar, baz], axis=1, ignore_index=True) expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("mapping", ["mapping", "dict"]) def test_concat_mapping(self, mapping, non_dict_mapping_subclass): constructor = dict if mapping == "dict" else non_dict_mapping_subclass frames = constructor( { "foo": DataFrame(np.random.randn(4, 3)), "bar": DataFrame(np.random.randn(4, 3)), "baz": DataFrame(np.random.randn(4, 3)), "qux": DataFrame(np.random.randn(4, 3)), } ) sorted_keys = list(frames.keys()) result = concat(frames) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) tm.assert_frame_equal(result, expected) result = concat(frames, axis=1) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) tm.assert_frame_equal(result, expected) keys = ["baz", "foo", "bar"] result = concat(frames, keys=keys) expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) def test_concat_ignore_index(self, sort): frame1 = DataFrame( {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} ) frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) nan = np.nan expected = DataFrame( [ [nan, nan, nan, 4.3], ["a", 1, 4.5, 5.2], ["b", 2, 3.2, 2.2], ["c", 3, 1.2, nan], ], index=Index(["q", "x", "y", "z"]), ) if not sort: expected = expected.loc[["x", "y", "z", "q"]] tm.assert_frame_equal(v1, expected) def test_concat_multiindex_with_keys(self): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) frame = DataFrame( np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp"), ) result = concat([frame, frame], keys=[0, 1], names=["iteration"]) assert result.index.names == ("iteration",) + index.names tm.assert_frame_equal(result.loc[0], frame) tm.assert_frame_equal(result.loc[1], frame) assert result.index.nlevels == 3 def test_concat_multiindex_with_tz(self): # GH 6606 df = DataFrame( { "dt": [ datetime(2014, 1, 1), datetime(2014, 1, 2), datetime(2014, 1, 3), ], "b": ["A", "B", "C"], "c": [1, 2, 3], "d": [4, 5, 6], } ) df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) df = df.set_index(["dt", "b"]) exp_idx1 = DatetimeIndex( ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" ) exp_idx2 = Index(["A", "B", "C"] * 2, name="b") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) expected = DataFrame( {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] ) result = concat([df, df]) tm.assert_frame_equal(result, expected) def test_concat_multiindex_with_none_in_index_names(self): # GH 15787 index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) df = pd.DataFrame({"col": range(5)}, index=index, dtype=np.int32) result = concat([df, df], keys=[1, 2], names=["level2"]) index = pd.MultiIndex.from_product( [[1, 2], [1], range(5)], names=["level2", "level1", None] ) expected = pd.DataFrame( {"col": list(range(5)) * 2}, index=index, dtype=np.int32 ) tm.assert_frame_equal(result, expected) result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) level2 = [1] * 5 + [2] * 2 level1 = [1] * 7 no_name = list(range(5)) + list(range(2)) tuples = list(zip(level2, level1, no_name)) index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) expected = pd.DataFrame({"col": no_name}, index=index, dtype=np.int32) tm.assert_frame_equal(result, expected) def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3)) df2 = DataFrame(np.random.randn(1, 4)) levels = [["foo", "baz"], ["one", "two"]] names = ["first", "second"] result = concat( [df, df2, df, df2], keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], levels=levels, names=names, ) expected = concat([df, df2, df, df2]) exp_index = MultiIndex( levels=levels + [[0]], codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], names=names + [None], ) expected.index = exp_index tm.assert_frame_equal(result, expected) # no names result = concat( [df, df2, df, df2], keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], levels=levels, ) assert result.index.names == (None,) * 3 # no levels result = concat( [df, df2, df, df2], keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], names=["first", "second"], ) assert result.index.names == ("first", "second", None) tm.assert_index_equal( result.index.levels[0], Index(["baz", "foo"], name="first") ) def test_concat_keys_levels_no_overlap(self): # GH #1406 df = DataFrame(np.random.randn(1, 3), index=["a"]) df2 = DataFrame(np.random.randn(1, 4), index=["b"]) msg = "Values not found in passed level" with pytest.raises(ValueError, match=msg): concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) msg = "Key one not in level" with pytest.raises(ValueError, match=msg): concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) def test_concat_rename_index(self): a = DataFrame( np.random.rand(3, 3), columns=list("ABC"), index=Index(list("abc"), name="index_a"), ) b = DataFrame( np.random.rand(3, 3), columns=list("ABC"), index=Index(list("abc"), name="index_b"), ) result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) names = list(exp.index.names) names[1] = "lvl1" exp.index.set_names(names, inplace=True) tm.assert_frame_equal(result, exp) assert result.index.names == exp.index.names def test_crossed_dtypes_weird_corner(self): columns = ["A", "B", "C", "D"] df1 = DataFrame( { "A": np.array([1, 2, 3, 4], dtype="f8"), "B": np.array([1, 2, 3, 4], dtype="i8"), "C": np.array([1, 2, 3, 4], dtype="f8"), "D": np.array([1, 2, 3, 4], dtype="i8"), }, columns=columns, ) df2 = DataFrame( { "A": np.array([1, 2, 3, 4], dtype="i8"), "B": np.array([1, 2, 3, 4], dtype="f8"), "C": np.array([1, 2, 3, 4], dtype="i8"), "D": np.array([1, 2, 3, 4], dtype="f8"), }, columns=columns, ) appended = df1.append(df2, ignore_index=True) expected = DataFrame( np.concatenate([df1.values, df2.values], axis=0), columns=columns ) tm.assert_frame_equal(appended, expected) df = DataFrame(np.random.randn(1, 3), index=["a"]) df2 = DataFrame(np.random.randn(1, 4), index=["b"]) result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) assert result.index.names == ("first", "second") def test_dups_index(self): # GH 4771 # single dtypes df = DataFrame( np.random.randint(0, 10, size=40).reshape(10, 4), columns=["A", "A", "C", "C"], ) result = concat([df, df], axis=1) tm.assert_frame_equal(result.iloc[:, :4], df) tm.assert_frame_equal(result.iloc[:, 4:], df) result = concat([df, df], axis=0) tm.assert_frame_equal(result.iloc[:10], df) tm.assert_frame_equal(result.iloc[10:], df) # multi dtypes df = concat( [ DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), DataFrame( np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] ), ], axis=1, ) result = concat([df, df], axis=1) tm.assert_frame_equal(result.iloc[:, :6], df) tm.assert_frame_equal(result.iloc[:, 6:], df) result = concat([df, df], axis=0) tm.assert_frame_equal(result.iloc[:10], df) tm.assert_frame_equal(result.iloc[10:], df) # append result = df.iloc[0:8, :].append(df.iloc[8:]) tm.assert_frame_equal(result, df) result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) tm.assert_frame_equal(result, df) expected = concat([df, df], axis=0) result = df.append(df) tm.assert_frame_equal(result, expected) def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) # it works concat([df1, df2], sort=sort) def test_handle_empty_objects(self, sort): df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) baz = df[:5].copy() baz["foo"] = "bar" empty = df[5:5] frames = [baz, empty, empty, df[5:]] concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype("O") expected.loc[0:4, "foo"] = "bar" tm.assert_frame_equal(concatted, expected) # empty as first element with time series # GH3259 df = DataFrame( dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") ) empty = DataFrame() result = concat([df, empty], axis=1) tm.assert_frame_equal(result, df) result = concat([empty, df], axis=1) tm.assert_frame_equal(result, df) result = concat([df, empty]) tm.assert_frame_equal(result, df) result = concat([empty, df]) tm.assert_frame_equal(result, df) def test_concat_mixed_objs(self): # concat mixed series/frames # G2385 # axis 1 index = date_range("01-Jan-2013", periods=10, freq="H") arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) df = DataFrame(arr.reshape(-1, 1), index=index) expected = DataFrame( np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] ) result = concat([df, df], axis=1) tm.assert_frame_equal(result, expected) expected = DataFrame( np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] ) result = concat([s1, s2], axis=1) tm.assert_frame_equal(result, expected) expected = DataFrame( np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] ) result = concat([s1, s2, s1], axis=1) tm.assert_frame_equal(result, expected) expected = DataFrame( np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] ) result = concat([s1, df, s2, s2, s1], axis=1) tm.assert_frame_equal(result, expected) # with names s1.name = "foo" expected = DataFrame( np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] ) result = concat([s1, df, s2], axis=1) tm.assert_frame_equal(result, expected) s2.name = "bar" expected = DataFrame( np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] ) result = concat([s1, df, s2], axis=1) tm.assert_frame_equal(result, expected) # ignore index expected = DataFrame( np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] ) result = concat([s1, df, s2], axis=1, ignore_index=True) tm.assert_frame_equal(result, expected) # axis 0 expected = DataFrame( np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] ) result = concat([s1, df, s2]) tm.assert_frame_equal(result, expected) expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) def test_empty_dtype_coerce(self): # xref to #12411 # xref to #12045 # xref to #11594 # see below # 10571 df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) result = concat([df1, df2]) expected = df1.dtypes tm.assert_series_equal(result.dtypes, expected) def test_dtype_coerceion(self): # 12411 df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 12045 import datetime df = DataFrame( {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} ) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 11594 df = DataFrame({"text": ["some words"] + [None] * 9}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) def test_concat_series(self): ts = tm.makeTimeSeries() ts.name = "foo" pieces = [ts[:5], ts[5:15], ts[15:]] result = concat(pieces) tm.assert_series_equal(result, ts) assert result.name == ts.name result = concat(pieces, keys=[0, 1, 2]) expected = ts.copy() ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) expected.index = exp_index tm.assert_series_equal(result, expected) def test_concat_series_axis1(self, sort=sort): ts = tm.makeTimeSeries() pieces = [ts[:-2], ts[2:], ts[2:-2]] result = concat(pieces, axis=1) expected = DataFrame(pieces).T tm.assert_frame_equal(result, expected) result = concat(pieces, keys=["A", "B", "C"], axis=1) expected = DataFrame(pieces, index=["A", "B", "C"]).T tm.assert_frame_equal(result, expected) # preserve series names, #2489 s = Series(randn(5), name="A") s2 = Series(randn(5), name="B") result = concat([s, s2], axis=1) expected = DataFrame({"A": s, "B": s2}) tm.assert_frame_equal(result, expected) s2.name = None result = concat([s, s2], axis=1) tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) # must reindex, #2603 s = Series(randn(3), index=["c", "a", "b"], name="A") s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") result = concat([s, s2], axis=1, sort=sort) expected = DataFrame({"A": s, "B": s2}) tm.assert_frame_equal(result, expected) def test_concat_series_axis1_names_applied(self): # ensure names argument is not ignored on axis=1, #23490 s = Series([1, 2, 3]) s2 = Series([4, 5, 6]) result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) expected = DataFrame( [[1, 4], [2, 5], [3, 6]], columns=pd.Index(["a", "b"], name="A") ) tm.assert_frame_equal(result, expected) result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) expected = DataFrame( [[1, 4], [2, 5], [3, 6]], columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), ) tm.assert_frame_equal(result, expected) def test_concat_single_with_key(self): df = DataFrame(np.random.randn(10, 4)) result = concat([df], keys=["foo"]) expected = concat([df, df], keys=["foo", "bar"]) tm.assert_frame_equal(result, expected[:10]) def test_concat_exclude_none(self): df = DataFrame(np.random.randn(10, 4)) pieces = [df[:5], None, None, df[5:]] result = concat(pieces) tm.assert_frame_equal(result, df) with pytest.raises(ValueError, match="All objects passed were None"): concat([None, None]) def test_concat_datetime64_block(self): from pandas.core.indexes.datetimes import date_range rng = date_range("1/1/2000", periods=10) df = DataFrame({"time": rng}) result = concat([df, df]) assert (result.iloc[:10]["time"] == rng).all() assert (result.iloc[10:]["time"] == rng).all() def test_concat_timedelta64_block(self): from pandas import to_timedelta rng = to_timedelta(np.arange(10), unit="s") df = DataFrame({"time": rng}) result = concat([df, df]) assert (result.iloc[:10]["time"] == rng).all() assert (result.iloc[10:]["time"] == rng).all() def test_concat_keys_with_none(self): # #1649 df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) tm.assert_frame_equal(result, expected) result = concat( [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] ) expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) tm.assert_frame_equal(result, expected) def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] # to join with union # these two are of different length! left = concat([ts1, ts2], join="outer", axis=1) right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) def test_concat_bug_2972(self): ts0 = Series(np.zeros(5)) ts1 = Series(np.ones(5)) ts0.name = ts1.name = "same name" result = concat([ts0, ts1], axis=1) expected = DataFrame({0: ts0, 1: ts1}) expected.columns = ["same name", "same name"] tm.assert_frame_equal(result, expected) def test_concat_bug_3602(self): # GH 3602, duplicate columns df1 = DataFrame( { "firmNo": [0, 0, 0, 0], "prc": [6, 6, 6, 6], "stringvar": ["rrr", "rrr", "rrr", "rrr"], } ) df2 = DataFrame( {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} ) expected = DataFrame( [ [0, 6, "rrr", 9, 1, 6], [0, 6, "rrr", 10, 2, 6], [0, 6, "rrr", 11, 3, 6], [0, 6, "rrr", 12, 4, 6], ] ) expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] result = concat([df1, df2], axis=1) tm.assert_frame_equal(result, expected) def test_concat_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") df_expected = pd.DataFrame({"a": []}, index=[], dtype="int64") for how, expected in [("inner", df_expected), ("outer", df_a)]: result = pd.concat([df_a, df_empty], axis=1, join=how) tm.assert_frame_equal(result, expected) def test_concat_series_axis1_same_names_ignore_index(self): dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] s1 = Series(randn(len(dates)), index=dates, name="value") s2 = Series(randn(len(dates)), index=dates, name="value") result = concat([s1, s2], axis=1, ignore_index=True) expected = Index([0, 1]) tm.assert_index_equal(result.columns, expected) def test_concat_iterables(self): # GH8645 check concat works with tuples, list, generators, and weird # stuff like deque and custom iterables df1 = DataFrame([1, 2, 3]) df2 = DataFrame([4, 5, 6]) expected = DataFrame([1, 2, 3, 4, 5, 6]) tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected) tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected) tm.assert_frame_equal( concat((df for df in (df1, df2)), ignore_index=True), expected ) tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: def __len__(self) -> int: return 2 def __getitem__(self, index): try: return {0: df1, 1: df2}[index] except KeyError as err: raise IndexError from err tm.assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) class CustomIterator2(abc.Iterable): def __iter__(self): yield df1 yield df2 tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = tm.makeCustomDataframe(10, 2) for obj in [1, dict(), [1, 2], (1, 2)]: msg = ( f"cannot concatenate object of type '{type(obj)}'; " "only Series and DataFrame objs are valid" ) with pytest.raises(TypeError, match=msg): concat([df1, obj]) def test_concat_invalid_first_argument(self): df1 = tm.makeCustomDataframe(10, 2) df2 = tm.makeCustomDataframe(10, 2) msg = ( "first argument must be an iterable of pandas " 'objects, you passed an object of type "DataFrame"' ) with pytest.raises(TypeError, match=msg): concat(df1, df2) # generator ok though concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) # text reader ok # GH6583 data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ reader = read_csv(StringIO(data), chunksize=1) result = concat(reader, ignore_index=True) expected = read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) def test_concat_NaT_series(self): # GH 11693 # test for merging NaT series with datetime series. x = Series( date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") ) y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") expected = Series([x[0], x[1], pd.NaT, pd.NaT]) result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # all NaT with tz expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") result = pd.concat([y, y], ignore_index=True) tm.assert_series_equal(result, expected) # without tz x = pd.Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) y = pd.Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) y[:] = pd.NaT expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # all NaT without tz x[:] = pd.NaT expected = pd.Series(pd.NaT, index=range(4), dtype="datetime64[ns]") result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_tz_frame(self): df2 = DataFrame( dict( A=pd.Timestamp("20130102", tz="US/Eastern"), B=pd.Timestamp("20130603", tz="CET"), ), index=range(5), ) # concat df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) tm.assert_frame_equal(df2, df3) def test_concat_tz_series(self): # gh-11755: tz and no tz x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) y = Series(date_range("2012-01-01", "2012-01-02")) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # gh-11887: concat tz and object x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) y = Series(["a", "b"]) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # see gh-12217 and gh-12306 # Concatenating two UTC times first = pd.DataFrame([[datetime(2016, 1, 1)]]) first[0] = first[0].dt.tz_localize("UTC") second = pd.DataFrame([[datetime(2016, 1, 2)]]) second[0] = second[0].dt.tz_localize("UTC") result = pd.concat([first, second]) assert result[0].dtype == "datetime64[ns, UTC]" # Concatenating two London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)]]) second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) assert result[0].dtype == "datetime64[ns, Europe/London]" # Concatenating 2+1 London times first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 3)]]) second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) assert result[0].dtype == "datetime64[ns, Europe/London]" # Concat'ing 1+2 London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) assert result[0].dtype == "datetime64[ns, Europe/London]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta x = [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-02-01", tz="US/Eastern"), ] y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) # tz and period y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) def test_concat_tz_series_tzlocal(self): # see gh-13583 x = [ pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), ] y = [ pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), ] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y)) assert result.dtype == "datetime64[ns, tzlocal()]" @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): # GH 12396 # tz-naive first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( lambda x: x.dt.tz_localize(tz1) ) second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) result = pd.concat([first, second], axis=0) expected = pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): # GH 12396 first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize(tz2), columns=[1]) expected = pd.DataFrame( { 0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), } ) result = pd.concat([first, second], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): # GH 12396 # tz-naive first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) second = pd.DataFrame( [ [pd.Timestamp("2015/01/01", tz=tz2)], [pd.Timestamp("2016/01/01", tz=tz2)], ], index=[2, 3], ) expected = pd.DataFrame( [ pd.NaT, pd.NaT, pd.Timestamp("2015/01/01", tz=tz2), pd.Timestamp("2016/01/01", tz=tz2), ] ) if tz1 != tz2: expected = expected.astype(object) result = pd.concat([first, second]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_concat_NaT_dataframes(self, tz): # GH 12396 first = pd.DataFrame([[pd.NaT], [pd.NaT]]) first = first.apply(lambda x: x.dt.tz_localize(tz)) second = pd.DataFrame( [[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], index=[2, 3], ) expected = pd.DataFrame( [ pd.NaT, pd.NaT, pd.Timestamp("2015/01/01", tz=tz), pd.Timestamp("2016/01/01", tz=tz), ] ) result = pd.concat([first, second], axis=0) tm.assert_frame_equal(result, expected) def test_concat_period_series(self): x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_period_multiple_freq_series(self): x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) assert result.dtype == "object" def test_concat_period_other_series(self): x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) assert result.dtype == "object" # non-period x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) assert result.dtype == "object" x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(["A", "B"]) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) assert result.dtype == "object" def test_concat_empty_series(self): # GH 11082 s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series(name="y", dtype="float64") res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame( {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, index=pd.Index([0, 1, 2], dtype="O"), ) tm.assert_frame_equal(res, exp) s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series(name="y", dtype="float64") res = pd.concat([s1, s2], axis=0) # name will be reset exp = pd.Series([1, 2, 3]) tm.assert_series_equal(res, exp) # empty Series with no name s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series(name=None, dtype="float64") res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame( {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, columns=["x", 0], index=pd.Index([0, 1, 2], dtype="O"), ) tm.assert_frame_equal(res, exp) @pytest.mark.parametrize("tz", [None, "UTC"]) @pytest.mark.parametrize("values", [[], [1, 2, 3]]) def test_concat_empty_series_timelike(self, tz, values): # GH 18447 first = Series([], dtype="M8[ns]").dt.tz_localize(tz) dtype = None if values else np.float64 second = Series(values, dtype=dtype) expected = DataFrame( { 0: pd.Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), 1: values, } ) result = concat([first, second], axis=1) tm.assert_frame_equal(result, expected) def test_default_index(self): # is_series and ignore_index s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series([4, 5, 6], name="y") res = pd.concat([s1, s2], axis=1, ignore_index=True) assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) # use check_index_type=True to check the result have # RangeIndex (default index) tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_series and all inputs have no names s1 = pd.Series([1, 2, 3]) s2 = pd.Series([4, 5, 6]) res = pd.concat([s1, s2], axis=1, ignore_index=False) assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) exp.columns = pd.RangeIndex(2) tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_dataframe and ignore_index df1 = pd.DataFrame({"A": [1, 2], "B": [5, 6]}) df2 = pd.DataFrame({"A": [3, 4], "B": [7, 8]}) res = pd.concat([df1, df2], axis=0, ignore_index=True) exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) res = pd.concat([df1, df2], axis=1, ignore_index=True) exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) def test_concat_multiindex_rangeindex(self): # GH13542 # when multi-index levels are RangeIndex objects # there is a bug in concat with objects of len 1 df = DataFrame(np.random.randn(9, 2)) df.index = MultiIndex( levels=[pd.RangeIndex(3), pd.RangeIndex(3)], codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], ) res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) exp = df.iloc[[2, 3, 4, 5], :] tm.assert_frame_equal(res, exp) def test_concat_multiindex_dfs_with_deepcopy(self): # GH 9967 from copy import deepcopy example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) example_dataframe1 = pd.DataFrame([0], index=example_multiindex1) example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) example_dataframe2 = pd.DataFrame([1], index=example_multiindex2) example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} expected_index = pd.MultiIndex( levels=[["s1", "s2"], ["a"], ["b", "c"]], codes=[[0, 1], [0, 0], [0, 1]], names=["testname", None, None], ) expected = pd.DataFrame([[0], [1]], index=expected_index) result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) tm.assert_frame_equal(result_copy, expected) result_no_copy = pd.concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) def test_categorical_concat_append(self): cat = Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] df = DataFrame({"cats": cat, "vals": vals}) cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) vals2 = [1, 2, 1, 2] exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) tm.assert_frame_equal(pd.concat([df, df]), exp) tm.assert_frame_equal(df.append(df), exp) # GH 13524 can concat different categories cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) res = pd.concat([df, df_different_categories], ignore_index=True) exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) tm.assert_frame_equal(res, exp) res = df.append(df_different_categories, ignore_index=True) tm.assert_frame_equal(res, exp) def test_categorical_concat_dtypes(self): # GH8143 index = ["cat", "obj", "num"] cat = Categorical(["a", "b", "c"]) obj = Series(["a", "b", "c"]) num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) result = df.dtypes == "object" expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) result = df.dtypes == "int64" expected = Series([False, False, True], index=index) tm.assert_series_equal(result, expected) result = df.dtypes == "category" expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) def test_categorical_concat(self, sort): # See GH 10177 df1 = DataFrame( np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] ) df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) cat_values = ["one", "one", "two", "one", "two", "two", "one"] df2["h"] = Series(Categorical(cat_values)) res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) exp = DataFrame( { "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], "b": [ 1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, ], "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], "h": [None] * 6 + cat_values, } ) tm.assert_frame_equal(res, exp) def test_categorical_concat_gh7864(self): # GH 7864 # make sure ordering is preserved df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) df["grade"] = Categorical(df["raw_grade"]) df["grade"].cat.set_categories(["e", "a", "b"]) df1 = df[0:3] df2 = df[3:] tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) dfx = pd.concat([df1, df2]) tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) dfa = df1.append(df2) tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) def test_categorical_concat_preserve(self): # GH 8641 series concat not preserving category dtype # GH 13524 can concat different categories s = Series(list("abc"), dtype="category") s2 = Series(list("abd"), dtype="category") exp = Series(list("abcabd")) res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) exp = Series(list("abcabc"), dtype="category") res = pd.concat([s, s], ignore_index=True) tm.assert_series_equal(res, exp) exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") res = pd.concat([s, s]) tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype="int64")) b = Series(list("aabbca")) df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) res = pd.concat([df2, df2]) exp = DataFrame( { "A": pd.concat([a, a]), "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), } ) tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): a = Series(np.arange(6, dtype="int64")) b = Series(list("aabbca")) df2 = DataFrame( {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} ).set_index("B") result = pd.concat([df2, df2]) expected = DataFrame( { "A": pd.concat([a, a]), "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), } ).set_index("B") tm.assert_frame_equal(result, expected) # wrong categories df3 = DataFrame( {"A": a, "B": Categorical(b, categories=list("abe"))} ).set_index("B") msg = "categories must match existing categories when appending" with pytest.raises(TypeError, match=msg): pd.concat([df2, df3]) def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] a = pd.Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) b = pd.Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) c = pd.Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) result = pd.concat([a, b, c], axis=1) exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) exp = pd.DataFrame( { 0: [1, 1, np.nan, np.nan], 1: [np.nan, 2, 2, np.nan], 2: [np.nan, np.nan, 3, 3], }, columns=[0, 1, 2], index=exp_idx, ) tm.assert_frame_equal(result, exp) def test_concat_order(self): # GH 17344 dfs = [pd.DataFrame(index=range(3), columns=["a", 1, None])] dfs += [ pd.DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100) ] result = pd.concat(dfs, sort=True).columns expected = dfs[0].columns tm.assert_index_equal(result, expected) def test_concat_datetime_timezone(self): # GH 18523 idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1) df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2) result = pd.concat([df1, df2], axis=1) exp_idx = ( DatetimeIndex( [ "2011-01-01 00:00:00+01:00", "2011-01-01 01:00:00+01:00", "2011-01-01 02:00:00+01:00", ], freq="H", ) .tz_convert("UTC") .tz_convert("Europe/Paris") ) expected = pd.DataFrame( [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] ) tm.assert_frame_equal(result, expected) idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") df3 = pd.DataFrame({"b": [1, 2, 3]}, index=idx3) result = pd.concat([df1, df3], axis=1) exp_idx = DatetimeIndex( [ "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", "2010-12-31 23:00:00+00:00", "2011-01-01 00:00:00+00:00", "2011-01-01 01:00:00+00:00", ] ) expected = pd.DataFrame( [ [np.nan, 1], [np.nan, 2], [np.nan, 3], [1, np.nan], [2, np.nan], [3, np.nan], ], index=exp_idx, columns=["a", "b"], ) tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample result = pd.concat( [df1.resample("H").mean(), df2.resample("H").mean()], sort=True ) expected = pd.DataFrame( {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, index=idx1.append(idx1), ) tm.assert_frame_equal(result, expected) def test_concat_different_extension_dtypes_upcasts(self): a = pd.Series(pd.core.arrays.integer_array([1, 2])) b = pd.Series(to_decimal([1, 2])) result = pd.concat([a, b], ignore_index=True) expected = pd.Series([1, 2, Decimal(1), Decimal(2)], dtype=object) tm.assert_series_equal(result, expected) def test_concat_odered_dict(self): # GH 21510 expected = pd.concat( [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] ) result = pd.concat( OrderedDict( [("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))] ) ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["float"]) def test_concat_no_unnecessary_upcast(dt, pdt): # GH 13247 dims = pdt(dtype=object).ndim dfs = [ pdt(np.array([1], dtype=dt, ndmin=dims)), pdt(np.array([np.nan], dtype=dt, ndmin=dims)), pdt(np.array([5], dtype=dt, ndmin=dims)), ] x = pd.concat(dfs) assert x.values.dtype == dt @pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["int"]) def test_concat_will_upcast(dt, pdt): with catch_warnings(record=True): dims = pdt().ndim dfs = [ pdt(np.array([1], dtype=dt, ndmin=dims)), pdt(np.array([np.nan], ndmin=dims)), pdt(np.array([5], dtype=dt, ndmin=dims)), ] x = pd.concat(dfs) assert x.values.dtype == "float64" def test_concat_empty_and_non_empty_frame_regression(): # GH 18178 regression test df1 = pd.DataFrame({"foo": [1]}) df2 = pd.DataFrame({"foo": []}) expected = pd.DataFrame({"foo": [1.0]}) result = pd.concat([df1, df2]) tm.assert_frame_equal(result, expected) def test_concat_empty_and_non_empty_series_regression(): # GH 18187 regression test s1 = pd.Series([1]) s2 = pd.Series([], dtype=object) expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) def test_concat_sorts_columns(sort): # GH-4588 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) # for sort=True/None expected = pd.DataFrame( {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, columns=["a", "b", "c"], ) if sort is False: expected = expected[["b", "a", "c"]] # default with tm.assert_produces_warning(None): result = pd.concat([df1, df2], ignore_index=True, sort=sort) tm.assert_frame_equal(result, expected) def test_concat_sorts_index(sort): df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) # For True/None expected = pd.DataFrame( {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] ) if sort is False: expected = expected.loc[["c", "a", "b"]] # Warn and sort by default with tm.assert_produces_warning(None): result = pd.concat([df1, df2], axis=1, sort=sort) tm.assert_frame_equal(result, expected) def test_concat_inner_sort(sort): # https://github.com/pandas-dev/pandas/pull/20613 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) with tm.assert_produces_warning(None): # unset sort should *not* warn for inner join # since that never sorted result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) if sort is True: expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort(): # GH-4588 df = pd.DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) result = pd.concat([df, df], sort=True, ignore_index=True) expected = pd.DataFrame( {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, columns=["a", "b", "c"], ) tm.assert_frame_equal(result, expected) result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) expected = expected[["b", "c"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort_does_not_raise(): # GH-4588 # We catch TypeErrors from sorting internally and do not re-raise. df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) expected = pd.DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) def test_concat_series_name_npscalar_tuple(s1name, s2name): # GH21015 s1 = pd.Series({"a": 1, "b": 2}, name=s1name) s2 = pd.Series({"c": 5, "d": 6}, name=s2name) result = pd.concat([s1, s2]) expected = pd.Series({"a": 1, "b": 2, "c": 5, "d": 6}) tm.assert_series_equal(result, expected) def test_concat_categorical_tz(): # GH-23816 a = pd.Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) b = pd.Series(["a", "b"], dtype="category") result = pd.concat([a, b], ignore_index=True) expected = pd.Series( [ pd.Timestamp("2017-01-01", tz="US/Pacific"), pd.Timestamp("2017-01-02", tz="US/Pacific"), "a", "b", ] ) tm.assert_series_equal(result, expected) def test_concat_categorical_unchanged(): # GH-12007 # test fix for when concat on categorical and float # coerces dtype categorical -> float df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A")) ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B") result = pd.concat([df, ser], axis=1) expected = pd.DataFrame( { "A": pd.Series(["a", "b", "c", np.nan], dtype="category"), "B": pd.Series([0, 1, np.nan, 2], dtype="float"), } ) tm.assert_equal(result, expected) def test_concat_datetimeindex_freq(): # GH 3232 # Monotonic index result dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") data = list(range(100)) expected = pd.DataFrame(data, index=dr) result = pd.concat([expected[:50], expected[50:]]) tm.assert_frame_equal(result, expected) # Non-monotonic index result result = pd.concat([expected[50:], expected[:50]]) expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) expected.index._data.freq = None tm.assert_frame_equal(result, expected) def test_concat_empty_df_object_dtype(): # GH 9149 df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) df_2 = pd.DataFrame(columns=df_1.columns) result = pd.concat([df_1, df_2], axis=0) expected = df_1.astype(object) tm.assert_frame_equal(result, expected) def test_concat_sparse(): # GH 23557 a = pd.Series(SparseArray([0, 1, 2])) expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( pd.SparseDtype(np.int64, 0) ) result = pd.concat([a, a], axis=1) tm.assert_frame_equal(result, expected) def test_concat_dense_sparse(): # GH 30668 a = pd.Series(pd.arrays.SparseArray([1, None]), dtype=float) b = pd.Series([1], dtype=float) expected = pd.Series(data=[1, None, 1], index=[0, 1, 0]).astype( pd.SparseDtype(np.float64, None) ) result = pd.concat([a, b], axis=0) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("test_series", [True, False]) def test_concat_copy_index(test_series, axis): # GH 29879 if test_series: ser = Series([1, 2]) comb = concat([ser, ser], axis=axis, copy=True) assert comb.index is not ser.index else: df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) comb = concat([df, df], axis=axis, copy=True) assert comb.index is not df.index assert comb.columns is not df.columns def test_concat_multiindex_datetime_object_index(): # https://github.com/pandas-dev/pandas/issues/11058 s = Series( ["a", "b"], index=MultiIndex.from_arrays( [[1, 2], Index([dt.date(2013, 1, 1), dt.date(2014, 1, 1)], dtype="object")], names=["first", "second"], ), ) s2 = Series( ["a", "b"], index=MultiIndex.from_arrays( [[1, 2], Index([dt.date(2013, 1, 1), dt.date(2015, 1, 1)], dtype="object")], names=["first", "second"], ), ) expected = DataFrame( [["a", "a"], ["b", np.nan], [np.nan, "b"]], index=MultiIndex.from_arrays( [ [1, 2, 2], DatetimeIndex( ["2013-01-01", "2014-01-01", "2015-01-01"], dtype="datetime64[ns]", freq=None, ), ], names=["first", "second"], ), ) result = concat([s, s2], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]]) def test_duplicate_keys(keys): # GH 33654 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) s1 = Series([7, 8, 9], name="c") s2 = Series([10, 11, 12], name="d") result = concat([df, s1, s2], axis=1, keys=keys) expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]] expected_columns = pd.MultiIndex.from_tuples( [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")] ) expected = DataFrame(expected_values, columns=expected_columns) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "obj", [ tm.SubclassedDataFrame({"A": np.arange(0, 10)}), tm.SubclassedSeries(np.arange(0, 10), name="A"), ], ) def test_concat_preserves_subclass(obj): # GH28330 -- preserve subclass result = concat([obj, obj]) assert isinstance(result, type(obj)) def test_concat_frame_axis0_extension_dtypes(): # preserve extension dtype (through common_dtype mechanism) df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")}) df2 = pd.DataFrame({"a": np.array([4, 5, 6])}) result = pd.concat([df1, df2], ignore_index=True) expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64") tm.assert_frame_equal(result, expected) result = pd.concat([df2, df1], ignore_index=True) expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") tm.assert_frame_equal(result, expected)