From 982a839408a4f8b7bac5c6df7d664455d215af34 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Thu, 19 May 2022 17:08:16 +0200 Subject: [PATCH 01/17] DOC: Improve reshape\concat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Matěj Štágl --- pandas/core/reshape/concat.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index c2b36dab4a67e..bde90f819f359 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -344,6 +344,20 @@ def concat( Traceback (most recent call last): ... ValueError: Indexes have overlapping values: ['a'] + + Append a single row to the end of a ``DataFrame`` object. + + >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0]) + >>> df7 + a b + 0 1 2 + >>> new_row = pd.Series([3]) + >>> new_row + 0 3 + >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) + a b 0 + 0 1.0 2.0 NaN + 1 NaN NaN 3.0 """ op = _Concatenator( objs, From 108d96e0009f89466e6107169d55a2bf5b6121a7 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:03:06 +0200 Subject: [PATCH 02/17] Update concat.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Matěj Štágl --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index bde90f819f359..1ac4b6c1f8897 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -354,7 +354,7 @@ def concat( >>> new_row = pd.Series([3]) >>> new_row 0 3 - >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) + >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) a b 0 0 1.0 2.0 NaN 1 NaN NaN 3.0 From f4e394d4fedfbfeb5ea562c8868e5520bc99c2b7 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:04:53 +0200 Subject: [PATCH 03/17] Revert "Merge branch 'pandas-dev:main' into doc-concat" This reverts commit 824b9bd4eec93b75cc4d0fa64cc858612ecba341, reversing changes made to 982a839408a4f8b7bac5c6df7d664455d215af34. --- asv_bench/benchmarks/gil.py | 82 ++++++------------- .../06_calculate_statistics.rst | 4 +- doc/source/user_guide/10min.rst | 2 +- doc/source/user_guide/groupby.rst | 26 +++--- doc/source/user_guide/indexing.rst | 2 +- doc/source/user_guide/reshaping.rst | 15 ++-- doc/source/user_guide/timeseries.rst | 4 +- doc/source/whatsnew/v0.18.1.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 4 +- doc/source/whatsnew/v1.4.3.rst | 1 - pandas/_testing/__init__.py | 50 +++++++++++ pandas/core/groupby/groupby.py | 11 +-- pandas/tests/groupby/aggregate/test_numba.py | 27 ------ pandas/tests/groupby/transform/test_numba.py | 27 ------ 14 files changed, 101 insertions(+), 156 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 31654a5c75617..af2efe56c2530 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,6 +1,3 @@ -from functools import wraps -import threading - import numpy as np from pandas import ( @@ -33,57 +30,21 @@ from pandas._libs import algos except ImportError: from pandas import algos +try: + from pandas._testing import test_parallel # noqa: PDF014 + have_real_test_parallel = True +except ImportError: + have_real_test_parallel = False -from .pandas_vb_common import BaseIO # isort:skip - - -def test_parallel(num_threads=2, kwargs_list=None): - """ - Decorator to run the same function multiple times in parallel. 
- - Parameters - ---------- - num_threads : int, optional - The number of times the function is run in parallel. - kwargs_list : list of dicts, optional - The list of kwargs to update original - function kwargs on different threads. - - Notes - ----- - This decorator does not pass the return value of the decorated function. - - Original from scikit-image: - - https://github.com/scikit-image/scikit-image/pull/1519 - - """ - assert num_threads > 0 - has_kwargs_list = kwargs_list is not None - if has_kwargs_list: - assert len(kwargs_list) == num_threads + def test_parallel(num_threads=1): + def wrapper(fname): + return fname - def wrapper(func): - @wraps(func) - def inner(*args, **kwargs): - if has_kwargs_list: - update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) - else: - update_kwargs = lambda i: kwargs - threads = [] - for i in range(num_threads): - updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) - threads.append(thread) - for thread in threads: - thread.start() - for thread in threads: - thread.join() + return wrapper - return inner - return wrapper +from .pandas_vb_common import BaseIO # isort:skip class ParallelGroupbyMethods: @@ -92,7 +53,8 @@ class ParallelGroupbyMethods: param_names = ["threads", "method"] def setup(self, threads, method): - + if not have_real_test_parallel: + raise NotImplementedError N = 10**6 ngroups = 10**3 df = DataFrame( @@ -124,7 +86,8 @@ class ParallelGroups: param_names = ["threads"] def setup(self, threads): - + if not have_real_test_parallel: + raise NotImplementedError size = 2**22 ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) @@ -145,7 +108,8 @@ class ParallelTake1D: param_names = ["dtype"] def setup(self, dtype): - + if not have_real_test_parallel: + raise NotImplementedError N = 10**6 df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @@ -167,7 +131,8 @@ class ParallelKth: repeat = 5 def setup(self): - + if not have_real_test_parallel: + raise NotImplementedError N = 10**7 k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @@ -184,7 +149,8 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): - + if not have_real_test_parallel: + raise NotImplementedError N = 10**6 self.dti = date_range("1900-01-01", periods=N, freq="T") self.period = self.dti.to_period("D") @@ -238,7 +204,8 @@ class ParallelRolling: param_names = ["method"] def setup(self, method): - + if not have_real_test_parallel: + raise NotImplementedError win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, "rolling"): @@ -281,7 +248,8 @@ class ParallelReadCSV(BaseIO): param_names = ["dtype"] def setup(self, dtype): - + if not have_real_test_parallel: + raise NotImplementedError rows = 10000 cols = 50 data = { @@ -316,6 +284,8 @@ class ParallelFactorize: param_names = ["threads"] def setup(self, threads): + if not have_real_test_parallel: + raise NotImplementedError strings = tm.makeStringIndex(100000) diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index 346a5cecf601d..298d0c4e0111c 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas. 
In the previous example, we explicitly selected the 2 columns first. If not, the ``mean`` method is applied to each column containing numerical -columns by passing ``numeric_only=True``: +columns: .. ipython:: python - titanic.groupby("Sex").mean(numeric_only=True) + titanic.groupby("Sex").mean() It does not make much sense to get the average value of the ``Pclass``. If we are only interested in the average age for each gender, the diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 9916f13e015dd..9ccf191194e19 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -532,7 +532,7 @@ groups: .. ipython:: python - df.groupby("A")[["C", "D"]].sum() + df.groupby("A").sum() Grouping by multiple columns forms a hierarchical index, and again we can apply the :meth:`~pandas.core.groupby.GroupBy.sum` function: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index f2d83885df2d0..f381d72069775 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -477,7 +477,7 @@ An obvious one is aggregation via the .. ipython:: python grouped = df.groupby("A") - grouped[["C", "D"]].aggregate(np.sum) + grouped.aggregate(np.sum) grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) @@ -492,7 +492,7 @@ changed by using the ``as_index`` option: grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby("A", as_index=False)[["C", "D"]].sum() + df.groupby("A", as_index=False).sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: @@ -730,7 +730,7 @@ optimized Cython implementations: .. ipython:: python - df.groupby("A")[["C", "D"]].sum() + df.groupby("A").sum() df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above @@ -1159,12 +1159,13 @@ Again consider the example DataFrame we've been looking at: Suppose we wish to compute the standard deviation grouped by the ``A`` column. There is a slight problem, namely that we don't care about the data in -column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance -columns by specifying ``numeric_only=True``: +column ``B``. We refer to this as a "nuisance" column. If the passed +aggregation function can't be applied to some columns, the troublesome columns +will be (silently) dropped. Thus, this does not pose any problems: .. ipython:: python - df.groupby("A").std(numeric_only=True) + df.groupby("A").std() Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function @@ -1179,14 +1180,7 @@ is only interesting over one column (here ``colname``), it may be filtered If you do wish to include decimal or object columns in an aggregation with other non-nuisance data types, you must do so explicitly. -.. warning:: - The automatic dropping of nuisance columns has been deprecated and will be removed - in a future version of pandas. If columns are included that cannot be operated - on, pandas will instead raise an error. In order to avoid this, either select - the columns you wish to operate on or specify ``numeric_only=True``. - .. ipython:: python - :okwarning: from decimal import Decimal @@ -1310,7 +1304,7 @@ Groupby a specific column with the desired frequency. This is like resampling. .. 
ipython:: python - df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. @@ -1319,9 +1313,9 @@ that could be potential groupers. df = df.set_index("Date") df["Date"] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() - df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() Taking the first rows of each group diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 3c08b5a498eea..a94681924d211 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -583,7 +583,7 @@ without using a temporary variable. .. ipython:: python bb = pd.read_csv('data/baseball.csv', index_col='id') - (bb.groupby(['year', 'team']).sum(numeric_only=True) + (bb.groupby(['year', 'team']).sum() .loc[lambda df: df['r'] > 100]) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b24890564d1bf..f9e68b1b39ddc 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -414,11 +414,12 @@ We can produce pivot tables from this data very easily: The result object is a :class:`DataFrame` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table -will include all of the data in an additional level of hierarchy in the columns: +will include all of the data that can be aggregated in an additional level of +hierarchy in the columns: .. ipython:: python - pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"]) + pd.pivot_table(df, index=["A", "B"], columns=["C"]) Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification `. @@ -431,7 +432,7 @@ calling :meth:`~DataFrame.to_string` if you wish: .. ipython:: python - table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"]) + table = pd.pivot_table(df, index=["A", "B"], columns=["C"]) print(table.to_string(na_rep="")) Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame, @@ -448,13 +449,7 @@ rows and columns: .. ipython:: python - table = df.pivot_table( - index=["A", "B"], - columns="C", - values=["D", "E"], - margins=True, - aggfunc=np.std - ) + table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) table Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index c67d028b65b3e..582620d8b6479 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1821,7 +1821,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("M", on="date")[["a"]].sum() + df.resample("M", on="date").sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1829,7 +1829,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. 
ipython:: python - df.resample("M", level="d")[["a"]].sum() + df.resample("M", level="d").sum() .. _timeseries.iterating-label: diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7d9008fdbdecd..f873d320822ae 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -166,7 +166,7 @@ without using temporary variable. .. ipython:: python bb = pd.read_csv("data/baseball.csv", index_col="id") - (bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100]) + (bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100]) .. _whatsnew_0181.partial_string_indexing: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 113bbcf0a05bc..a2bb935c708bc 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -497,8 +497,8 @@ Other enhancements ), ) df - df.resample("M", on="date")[["a"]].sum() - df.resample("M", level="d")[["a"]].sum() + df.resample("M", on="date").sum() + df.resample("M", level="d").sum() - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 7c09eec212d69..23c8ad63bf7bb 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -16,7 +16,6 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) -- Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) .. --------------------------------------------------------------------------- diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 53e003e2ed7dd..603c2f081a31a 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -3,6 +3,7 @@ import collections from datetime import datetime from decimal import Decimal +from functools import wraps import operator import os import re @@ -748,6 +749,55 @@ def makeMissingDataframe(density=0.9, random_state=None): return df +def test_parallel(num_threads=2, kwargs_list=None): + """ + Decorator to run the same function multiple times in parallel. + + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + + Notes + ----- + This decorator does not pass the return value of the decorated function. 
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper + + class SubclassedSeries(Series): _metadata = ["testattr", "name"] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f7c89b6e7dc49..0203d54e0de86 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1310,16 +1310,7 @@ def _numba_prep(self, data): sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() - if len(self.grouper.groupings) > 1: - raise NotImplementedError( - "More than 1 grouping labels are not supported with engine='numba'" - ) - # GH 46867 - index_data = data.index - if isinstance(index_data, MultiIndex): - group_key = self.grouper.groupings[0].name - index_data = index_data.get_level_values(group_key) - sorted_index_data = index_data.take(sorted_index).to_numpy() + sorted_index_data = data.index.take(sorted_index).to_numpy() starts, ends = lib.generate_slices(sorted_ids, ngroups) return ( diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 9f71c2c2fa0b6..ba58ac27284b8 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -211,30 +211,3 @@ def func_kwargs(values, index): ) expected = DataFrame({"value": [1.0, 1.0, 1.0]}) tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("numba") -def test_multiindex_one_key(nogil, parallel, nopython): - def numba_func(values, index): - return 1 - - df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) - engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} - result = df.groupby("A").agg( - numba_func, engine="numba", engine_kwargs=engine_kwargs - ) - expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"]) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("numba") -def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): - def numba_func(values, index): - return 1 - - df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) - engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} - with pytest.raises(NotImplementedError, match="More than 1 grouping labels"): - df.groupby(["A", "B"]).agg( - numba_func, engine="numba", engine_kwargs=engine_kwargs - ) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 1b8570dbdc21d..a404e0b9304cc 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -199,30 +199,3 @@ def func_kwargs(values, index): ) expected = DataFrame({"value": [1.0, 1.0, 1.0]}) tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("numba") -def test_multiindex_one_key(nogil, parallel, nopython): - def numba_func(values, 
index): - return 1 - - df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) - engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} - result = df.groupby("A").transform( - numba_func, engine="numba", engine_kwargs=engine_kwargs - ) - expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"]) - tm.assert_frame_equal(result, expected) - - -@td.skip_if_no("numba") -def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): - def numba_func(values, index): - return 1 - - df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) - engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} - with pytest.raises(NotImplementedError, match="More than 1 grouping labels"): - df.groupby(["A", "B"]).transform( - numba_func, engine="numba", engine_kwargs=engine_kwargs - ) From 7d4a81fd3c10eeab9f38309bc508373329f0ce4b Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:14:12 +0200 Subject: [PATCH 04/17] Update concat.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Matěj Štágl --- pandas/core/reshape/concat.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1ac4b6c1f8897..8f20f554c295e 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -351,13 +351,16 @@ def concat( >>> df7 a b 0 1 2 - >>> new_row = pd.Series([3]) - >>> new_row - 0 3 - >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) - a b 0 - 0 1.0 2.0 NaN - 1 NaN NaN 3.0 + >>> new_rows = pd.Series({'a': 3, 'b': 4}) + >>> new_rows + 0 3 4 + >>> pd.concat([df7, new_rows.to_frame().T], ignore_index=True) + a b + 0 1 2 + 1 3 4 + + (It is not recomended to build DataFrames by adding single rows in a +not loop. Build a list of rows and make a DataFrame in a single concat.) """ op = _Concatenator( objs, From 873a59f4f9d8745d91220e361f55938f775b01c0 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:21:13 +0200 Subject: [PATCH 05/17] Update concat.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Matěj Štágl --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 8f20f554c295e..1e5741f024392 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -359,7 +359,7 @@ def concat( 0 1 2 1 3 4 - (It is not recomended to build DataFrames by adding single rows in a + (It is not recommended to build DataFrames by adding single rows in a not loop. Build a list of rows and make a DataFrame in a single concat.) """ op = _Concatenator( From 95139122722c3827cf34a6db7862ac0eeffaf727 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:29:33 +0200 Subject: [PATCH 06/17] Update concat.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Matěj Štágl --- pandas/core/reshape/concat.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1e5741f024392..78730db572e4d 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -226,6 +226,10 @@ def concat( pandas objects can be found `here `__. + + It is not recommended to build DataFrames by adding single rows in a +not loop. 
Build a list of rows and make a DataFrame in a single concat. + Examples -------- Combine two ``Series``. @@ -351,16 +355,13 @@ def concat( >>> df7 a b 0 1 2 - >>> new_rows = pd.Series({'a': 3, 'b': 4}) - >>> new_rows + >>> new_row = pd.Series({'a': 3, 'b': 4}) + >>> new_row 0 3 4 - >>> pd.concat([df7, new_rows.to_frame().T], ignore_index=True) + >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) a b 0 1 2 1 3 4 - - (It is not recommended to build DataFrames by adding single rows in a -not loop. Build a list of rows and make a DataFrame in a single concat.) """ op = _Concatenator( objs, From 0b2926527ec2b60ed3bfb3ddf433981113d74da0 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:44:54 +0200 Subject: [PATCH 07/17] Update pandas/core/reshape/concat.py Co-authored-by: Marco Edward Gorelli --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 78730db572e4d..e920a49ecdde3 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -228,7 +228,7 @@ def concat( It is not recommended to build DataFrames by adding single rows in a -not loop. Build a list of rows and make a DataFrame in a single concat. +for loop. Build a list of rows and make a DataFrame in a single concat. Examples -------- From eff3bad23f818be3e58c356d39685e460892b315 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Fri, 20 May 2022 19:45:20 +0200 Subject: [PATCH 08/17] Revert "Revert "Merge branch 'pandas-dev:main' into doc-concat"" This reverts commit f4e394d4fedfbfeb5ea562c8868e5520bc99c2b7. --- asv_bench/benchmarks/gil.py | 82 +++++++++++++------ .../06_calculate_statistics.rst | 4 +- doc/source/user_guide/10min.rst | 2 +- doc/source/user_guide/groupby.rst | 26 +++--- doc/source/user_guide/indexing.rst | 2 +- doc/source/user_guide/reshaping.rst | 15 ++-- doc/source/user_guide/timeseries.rst | 4 +- doc/source/whatsnew/v0.18.1.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 4 +- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/_testing/__init__.py | 50 ----------- pandas/core/groupby/groupby.py | 11 ++- pandas/tests/groupby/aggregate/test_numba.py | 27 ++++++ pandas/tests/groupby/transform/test_numba.py | 27 ++++++ 14 files changed, 156 insertions(+), 101 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index af2efe56c2530..31654a5c75617 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,3 +1,6 @@ +from functools import wraps +import threading + import numpy as np from pandas import ( @@ -30,21 +33,57 @@ from pandas._libs import algos except ImportError: from pandas import algos -try: - from pandas._testing import test_parallel # noqa: PDF014 - have_real_test_parallel = True -except ImportError: - have_real_test_parallel = False - def test_parallel(num_threads=1): - def wrapper(fname): - return fname +from .pandas_vb_common import BaseIO # isort:skip - return wrapper +def test_parallel(num_threads=2, kwargs_list=None): + """ + Decorator to run the same function multiple times in parallel. -from .pandas_vb_common import BaseIO # isort:skip + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + + Notes + ----- + This decorator does not pass the return value of the decorated function. 
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper class ParallelGroupbyMethods: @@ -53,8 +92,7 @@ class ParallelGroupbyMethods: param_names = ["threads", "method"] def setup(self, threads, method): - if not have_real_test_parallel: - raise NotImplementedError + N = 10**6 ngroups = 10**3 df = DataFrame( @@ -86,8 +124,7 @@ class ParallelGroups: param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError + size = 2**22 ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) @@ -108,8 +145,7 @@ class ParallelTake1D: param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError + N = 10**6 df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @@ -131,8 +167,7 @@ class ParallelKth: repeat = 5 def setup(self): - if not have_real_test_parallel: - raise NotImplementedError + N = 10**7 k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @@ -149,8 +184,7 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): - if not have_real_test_parallel: - raise NotImplementedError + N = 10**6 self.dti = date_range("1900-01-01", periods=N, freq="T") self.period = self.dti.to_period("D") @@ -204,8 +238,7 @@ class ParallelRolling: param_names = ["method"] def setup(self, method): - if not have_real_test_parallel: - raise NotImplementedError + win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, "rolling"): @@ -248,8 +281,7 @@ class ParallelReadCSV(BaseIO): param_names = ["dtype"] def setup(self, dtype): - if not have_real_test_parallel: - raise NotImplementedError + rows = 10000 cols = 50 data = { @@ -284,8 +316,6 @@ class ParallelFactorize: param_names = ["threads"] def setup(self, threads): - if not have_real_test_parallel: - raise NotImplementedError strings = tm.makeStringIndex(100000) diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index 298d0c4e0111c..346a5cecf601d 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas. In the previous example, we explicitly selected the 2 columns first. If not, the ``mean`` method is applied to each column containing numerical -columns: +columns by passing ``numeric_only=True``: .. ipython:: python - titanic.groupby("Sex").mean() + titanic.groupby("Sex").mean(numeric_only=True) It does not make much sense to get the average value of the ``Pclass``. 
If we are only interested in the average age for each gender, the diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 9ccf191194e19..9916f13e015dd 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -532,7 +532,7 @@ groups: .. ipython:: python - df.groupby("A").sum() + df.groupby("A")[["C", "D"]].sum() Grouping by multiple columns forms a hierarchical index, and again we can apply the :meth:`~pandas.core.groupby.GroupBy.sum` function: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index f381d72069775..f2d83885df2d0 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -477,7 +477,7 @@ An obvious one is aggregation via the .. ipython:: python grouped = df.groupby("A") - grouped.aggregate(np.sum) + grouped[["C", "D"]].aggregate(np.sum) grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) @@ -492,7 +492,7 @@ changed by using the ``as_index`` option: grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby("A", as_index=False).sum() + df.groupby("A", as_index=False)[["C", "D"]].sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: @@ -730,7 +730,7 @@ optimized Cython implementations: .. ipython:: python - df.groupby("A").sum() + df.groupby("A")[["C", "D"]].sum() df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above @@ -1159,13 +1159,12 @@ Again consider the example DataFrame we've been looking at: Suppose we wish to compute the standard deviation grouped by the ``A`` column. There is a slight problem, namely that we don't care about the data in -column ``B``. We refer to this as a "nuisance" column. If the passed -aggregation function can't be applied to some columns, the troublesome columns -will be (silently) dropped. Thus, this does not pose any problems: +column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance +columns by specifying ``numeric_only=True``: .. ipython:: python - df.groupby("A").std() + df.groupby("A").std(numeric_only=True) Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function @@ -1180,7 +1179,14 @@ is only interesting over one column (here ``colname``), it may be filtered If you do wish to include decimal or object columns in an aggregation with other non-nuisance data types, you must do so explicitly. +.. warning:: + The automatic dropping of nuisance columns has been deprecated and will be removed + in a future version of pandas. If columns are included that cannot be operated + on, pandas will instead raise an error. In order to avoid this, either select + the columns you wish to operate on or specify ``numeric_only=True``. + .. ipython:: python + :okwarning: from decimal import Decimal @@ -1304,7 +1310,7 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. @@ -1313,9 +1319,9 @@ that could be potential groupers. 
df = df.set_index("Date") df["Date"] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum() - df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum() Taking the first rows of each group diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index a94681924d211..3c08b5a498eea 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -583,7 +583,7 @@ without using a temporary variable. .. ipython:: python bb = pd.read_csv('data/baseball.csv', index_col='id') - (bb.groupby(['year', 'team']).sum() + (bb.groupby(['year', 'team']).sum(numeric_only=True) .loc[lambda df: df['r'] > 100]) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index f9e68b1b39ddc..b24890564d1bf 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -414,12 +414,11 @@ We can produce pivot tables from this data very easily: The result object is a :class:`DataFrame` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table -will include all of the data that can be aggregated in an additional level of -hierarchy in the columns: +will include all of the data in an additional level of hierarchy in the columns: .. ipython:: python - pd.pivot_table(df, index=["A", "B"], columns=["C"]) + pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"]) Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification `. @@ -432,7 +431,7 @@ calling :meth:`~DataFrame.to_string` if you wish: .. ipython:: python - table = pd.pivot_table(df, index=["A", "B"], columns=["C"]) + table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"]) print(table.to_string(na_rep="")) Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame, @@ -449,7 +448,13 @@ rows and columns: .. ipython:: python - table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) + table = df.pivot_table( + index=["A", "B"], + columns="C", + values=["D", "E"], + margins=True, + aggfunc=np.std + ) table Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 582620d8b6479..c67d028b65b3e 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1821,7 +1821,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("M", on="date").sum() + df.resample("M", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1829,7 +1829,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("M", level="d").sum() + df.resample("M", level="d")[["a"]].sum() .. _timeseries.iterating-label: diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index f873d320822ae..7d9008fdbdecd 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -166,7 +166,7 @@ without using temporary variable. .. 
ipython:: python bb = pd.read_csv("data/baseball.csv", index_col="id") - (bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100]) + (bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100]) .. _whatsnew_0181.partial_string_indexing: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index a2bb935c708bc..113bbcf0a05bc 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -497,8 +497,8 @@ Other enhancements ), ) df - df.resample("M", on="date").sum() - df.resample("M", level="d").sum() + df.resample("M", on="date")[["a"]].sum() + df.resample("M", level="d")[["a"]].sum() - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 23c8ad63bf7bb..7c09eec212d69 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) +- Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) .. --------------------------------------------------------------------------- diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 603c2f081a31a..53e003e2ed7dd 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -3,7 +3,6 @@ import collections from datetime import datetime from decimal import Decimal -from functools import wraps import operator import os import re @@ -749,55 +748,6 @@ def makeMissingDataframe(density=0.9, random_state=None): return df -def test_parallel(num_threads=2, kwargs_list=None): - """ - Decorator to run the same function multiple times in parallel. - - Parameters - ---------- - num_threads : int, optional - The number of times the function is run in parallel. - kwargs_list : list of dicts, optional - The list of kwargs to update original - function kwargs on different threads. - - Notes - ----- - This decorator does not pass the return value of the decorated function. 
- - Original from scikit-image: - - https://github.com/scikit-image/scikit-image/pull/1519 - - """ - assert num_threads > 0 - has_kwargs_list = kwargs_list is not None - if has_kwargs_list: - assert len(kwargs_list) == num_threads - import threading - - def wrapper(func): - @wraps(func) - def inner(*args, **kwargs): - if has_kwargs_list: - update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) - else: - update_kwargs = lambda i: kwargs - threads = [] - for i in range(num_threads): - updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) - threads.append(thread) - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - return inner - - return wrapper - - class SubclassedSeries(Series): _metadata = ["testattr", "name"] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0203d54e0de86..f7c89b6e7dc49 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1310,7 +1310,16 @@ def _numba_prep(self, data): sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() - sorted_index_data = data.index.take(sorted_index).to_numpy() + if len(self.grouper.groupings) > 1: + raise NotImplementedError( + "More than 1 grouping labels are not supported with engine='numba'" + ) + # GH 46867 + index_data = data.index + if isinstance(index_data, MultiIndex): + group_key = self.grouper.groupings[0].name + index_data = index_data.get_level_values(group_key) + sorted_index_data = index_data.take(sorted_index).to_numpy() starts, ends = lib.generate_slices(sorted_ids, ngroups) return ( diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ba58ac27284b8..9f71c2c2fa0b6 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -211,3 +211,30 @@ def func_kwargs(values, index): ) expected = DataFrame({"value": [1.0, 1.0, 1.0]}) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_one_key(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"]) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + with pytest.raises(NotImplementedError, match="More than 1 grouping labels"): + df.groupby(["A", "B"]).agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index a404e0b9304cc..1b8570dbdc21d 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -199,3 +199,30 @@ def func_kwargs(values, index): ) expected = DataFrame({"value": [1.0, 1.0, 1.0]}) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_one_key(nogil, parallel, nopython): + def numba_func(values, 
index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").transform( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + with pytest.raises(NotImplementedError, match="More than 1 grouping labels"): + df.groupby(["A", "B"]).transform( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) From 6b553919964bd8ea93a7e75676ba8f6271cba1d2 Mon Sep 17 00:00:00 2001 From: anetakahle Date: Sat, 21 May 2022 19:47:17 +0200 Subject: [PATCH 09/17] lint file --- pandas/core/reshape/concat.py | 406 +++++++++++++++++----------------- 1 file changed, 203 insertions(+), 203 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e920a49ecdde3..a48f8d3588c3a 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -157,211 +157,211 @@ def concat( copy: bool = True, ) -> DataFrame | Series: """ - Concatenate pandas objects along a particular axis with optional set logic - along the other axes. - - Can also add a layer of hierarchical indexing on the concatenation axis, - which may be useful if the labels are the same (or overlapping) on - the passed axis number. - - Parameters - ---------- - objs : a sequence or mapping of Series or DataFrame objects - If a mapping is passed, the sorted keys will be used as the `keys` - argument, unless it is passed, in which case the values will be - selected (see below). Any None objects will be dropped silently unless - they are all None in which case a ValueError will be raised. - axis : {0/'index', 1/'columns'}, default 0 - The axis to concatenate along. - join : {'inner', 'outer'}, default 'outer' - How to handle indexes on other axis (or axes). - ignore_index : bool, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. - keys : sequence, default None - If multiple levels passed, should contain tuples. Construct - hierarchical index using the passed keys as the outermost level. - levels : list of sequences, default None - Specific levels (unique values) to use for constructing a - MultiIndex. Otherwise they will be inferred from the keys. - names : list, default None - Names for the levels in the resulting hierarchical index. - verify_integrity : bool, default False - Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation. - sort : bool, default False - Sort non-concatenation axis if it is not already aligned when `join` - is 'outer'. - This has no effect when ``join='inner'``, which already preserves - the order of the non-concatenation axis. - - .. versionchanged:: 1.0.0 - - Changed to not sort by default. - - copy : bool, default True - If False, do not copy data unnecessarily. 
- - Returns - ------- - object, type of objs - When concatenating all ``Series`` along the index (axis=0), a - ``Series`` is returned. When ``objs`` contains at least one - ``DataFrame``, a ``DataFrame`` is returned. When concatenating along - the columns (axis=1), a ``DataFrame`` is returned. - - See Also - -------- - DataFrame.join : Join DataFrames using indexes. - DataFrame.merge : Merge DataFrames by indexes or columns. - - Notes - ----- - The keys, levels, and names arguments are all optional. - - A walkthrough of how this method fits in with other tools for combining - pandas objects can be found `here - `__. - - - It is not recommended to build DataFrames by adding single rows in a -for loop. Build a list of rows and make a DataFrame in a single concat. - - Examples - -------- - Combine two ``Series``. - - >>> s1 = pd.Series(['a', 'b']) - >>> s2 = pd.Series(['c', 'd']) - >>> pd.concat([s1, s2]) - 0 a - 1 b - 0 c - 1 d - dtype: object - - Clear the existing index and reset it in the result - by setting the ``ignore_index`` option to ``True``. - - >>> pd.concat([s1, s2], ignore_index=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - - Add a hierarchical index at the outermost level of - the data with the ``keys`` option. - - >>> pd.concat([s1, s2], keys=['s1', 's2']) - s1 0 a + Concatenate pandas objects along a particular axis with optional set logic + along the other axes. + + Can also add a layer of hierarchical indexing on the concatenation axis, + which may be useful if the labels are the same (or overlapping) on + the passed axis number. + + Parameters + ---------- + objs : a sequence or mapping of Series or DataFrame objects + If a mapping is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case a ValueError will be raised. + axis : {0/'index', 1/'columns'}, default 0 + The axis to concatenate along. + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis (or axes). + ignore_index : bool, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + names : list, default None + Names for the levels in the resulting hierarchical index. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + sort : bool, default False + Sort non-concatenation axis if it is not already aligned when `join` + is 'outer'. + This has no effect when ``join='inner'``, which already preserves + the order of the non-concatenation axis. + + .. versionchanged:: 1.0.0 + + Changed to not sort by default. + + copy : bool, default True + If False, do not copy data unnecessarily. 
+ + Returns + ------- + object, type of objs + When concatenating all ``Series`` along the index (axis=0), a + ``Series`` is returned. When ``objs`` contains at least one + ``DataFrame``, a ``DataFrame`` is returned. When concatenating along + the columns (axis=1), a ``DataFrame`` is returned. + + See Also + -------- + DataFrame.join : Join DataFrames using indexes. + DataFrame.merge : Merge DataFrames by indexes or columns. + + Notes + ----- + The keys, levels, and names arguments are all optional. + + A walkthrough of how this method fits in with other tools for combining + pandas objects can be found `here + `__. + + + It is not recommended to build DataFrames by adding single rows in a + for loop. Build a list of rows and make a DataFrame in a single concat. + + Examples + -------- + Combine two ``Series``. + + >>> s1 = pd.Series(['a', 'b']) + >>> s2 = pd.Series(['c', 'd']) + >>> pd.concat([s1, s2]) + 0 a 1 b - s2 0 c + 0 c 1 d - dtype: object - - Label the index keys you create with the ``names`` option. - - >>> pd.concat([s1, s2], keys=['s1', 's2'], - ... names=['Series name', 'Row ID']) - Series name Row ID - s1 0 a - 1 b - s2 0 c - 1 d - dtype: object - - Combine two ``DataFrame`` objects with identical columns. - - >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) - >>> df1 - letter number - 0 a 1 - 1 b 2 - >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) - >>> df2 - letter number - 0 c 3 - 1 d 4 - >>> pd.concat([df1, df2]) - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine ``DataFrame`` objects with overlapping columns - and return everything. Columns outside the intersection will - be filled with ``NaN`` values. - - >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) - >>> df3 - letter number animal - 0 c 3 cat - 1 d 4 dog - >>> pd.concat([df1, df3], sort=False) - letter number animal - 0 a 1 NaN - 1 b 2 NaN - 0 c 3 cat - 1 d 4 dog - - Combine ``DataFrame`` objects with overlapping columns - and return only those that are shared by passing ``inner`` to - the ``join`` keyword argument. - - >>> pd.concat([df1, df3], join="inner") - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine ``DataFrame`` objects horizontally along the x axis by - passing in ``axis=1``. - - >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], - ... columns=['animal', 'name']) - >>> pd.concat([df1, df4], axis=1) - letter number animal name - 0 a 1 bird polly - 1 b 2 monkey george - - Prevent the result from including duplicate index values with the - ``verify_integrity`` option. - - >>> df5 = pd.DataFrame([1], index=['a']) - >>> df5 - 0 - a 1 - >>> df6 = pd.DataFrame([2], index=['a']) - >>> df6 - 0 - a 2 - >>> pd.concat([df5, df6], verify_integrity=True) - Traceback (most recent call last): - ... - ValueError: Indexes have overlapping values: ['a'] - - Append a single row to the end of a ``DataFrame`` object. - - >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0]) - >>> df7 - a b - 0 1 2 - >>> new_row = pd.Series({'a': 3, 'b': 4}) - >>> new_row - 0 3 4 - >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) - a b - 0 1 2 - 1 3 4 + dtype: object + + Clear the existing index and reset it in the result + by setting the ``ignore_index`` option to ``True``. + + >>> pd.concat([s1, s2], ignore_index=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + Add a hierarchical index at the outermost level of + the data with the ``keys`` option. 
+
+    >>> pd.concat([s1, s2], keys=['s1', 's2'])
+    s1  0    a
+        1    b
+    s2  0    c
+        1    d
+    dtype: object
+
+    Label the index keys you create with the ``names`` option.
+
+    >>> pd.concat([s1, s2], keys=['s1', 's2'],
+    ...           names=['Series name', 'Row ID'])
+    Series name  Row ID
+    s1           0         a
+                 1         b
+    s2           0         c
+                 1         d
+    dtype: object
+
+    Combine two ``DataFrame`` objects with identical columns.
+
+    >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
+    ...                    columns=['letter', 'number'])
+    >>> df1
+      letter  number
+    0      a       1
+    1      b       2
+    >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
+    ...                    columns=['letter', 'number'])
+    >>> df2
+      letter  number
+    0      c       3
+    1      d       4
+    >>> pd.concat([df1, df2])
+      letter  number
+    0      a       1
+    1      b       2
+    0      c       3
+    1      d       4
+
+    Combine ``DataFrame`` objects with overlapping columns
+    and return everything. Columns outside the intersection will
+    be filled with ``NaN`` values.
+
+    >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
+    ...                    columns=['letter', 'number', 'animal'])
+    >>> df3
+      letter  number animal
+    0      c       3    cat
+    1      d       4    dog
+    >>> pd.concat([df1, df3], sort=False)
+      letter  number animal
+    0      a       1    NaN
+    1      b       2    NaN
+    0      c       3    cat
+    1      d       4    dog
+
+    Combine ``DataFrame`` objects with overlapping columns
+    and return only those that are shared by passing ``inner`` to
+    the ``join`` keyword argument.
+
+    >>> pd.concat([df1, df3], join="inner")
+      letter  number
+    0      a       1
+    1      b       2
+    0      c       3
+    1      d       4
+
+    Combine ``DataFrame`` objects horizontally along the x axis by
+    passing in ``axis=1``.
+
+    >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
+    ...                    columns=['animal', 'name'])
+    >>> pd.concat([df1, df4], axis=1)
+      letter  number  animal    name
+    0      a       1    bird   polly
+    1      b       2  monkey  george
+
+    Prevent the result from including duplicate index values with the
+    ``verify_integrity`` option.
+
+    >>> df5 = pd.DataFrame([1], index=['a'])
+    >>> df5
+       0
+    a  1
+    >>> df6 = pd.DataFrame([2], index=['a'])
+    >>> df6
+       0
+    a  2
+    >>> pd.concat([df5, df6], verify_integrity=True)
+    Traceback (most recent call last):
+        ...
+    ValueError: Indexes have overlapping values: ['a']
+
+    Append a single row to the end of a ``DataFrame`` object.
+
+    >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
+    >>> df7
+    a b
+    0 1 2
+    >>> new_row = pd.Series({'a': 3, 'b': 4})
+    >>> new_row
+    0 3 4
+    >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
+    a b
+    0 1 2
+    1 3 4
     """
     op = _Concatenator(
         objs,

From 28c9edeb1f0a8d64620dbea64a3b34abb8cef92a Mon Sep 17 00:00:00 2001
From: anetakahle
Date: Sat, 21 May 2022 20:23:28 +0200
Subject: [PATCH 10/17] indentation fix

---
 pandas/core/reshape/concat.py | 402 +++++++++++++++++-----------------
 1 file changed, 201 insertions(+), 201 deletions(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index a48f8d3588c3a..5d33af1bd8701 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -157,211 +157,211 @@ def concat(
     copy: bool = True,
 ) -> DataFrame | Series:
     """
-        Concatenate pandas objects along a particular axis with optional set logic
-        along the other axes.
-
-        Can also add a layer of hierarchical indexing on the concatenation axis,
-        which may be useful if the labels are the same (or overlapping) on
-        the passed axis number.
-
-        Parameters
-        ----------
-        objs : a sequence or mapping of Series or DataFrame objects
-            If a mapping is passed, the sorted keys will be used as the `keys`
-            argument, unless it is passed, in which case the values will be
-            selected (see below). Any None objects will be dropped silently unless
-            they are all None in which case a ValueError will be raised.
-        axis : {0/'index', 1/'columns'}, default 0
-            The axis to concatenate along.
-        join : {'inner', 'outer'}, default 'outer'
-            How to handle indexes on other axis (or axes).
-        ignore_index : bool, default False
-            If True, do not use the index values along the concatenation axis. The
-            resulting axis will be labeled 0, ..., n - 1. This is useful if you are
-            concatenating objects where the concatenation axis does not have
-            meaningful indexing information. Note the index values on the other
-            axes are still respected in the join.
-        keys : sequence, default None
-            If multiple levels passed, should contain tuples. Construct
-            hierarchical index using the passed keys as the outermost level.
-        levels : list of sequences, default None
-            Specific levels (unique values) to use for constructing a
-            MultiIndex. Otherwise they will be inferred from the keys.
-        names : list, default None
-            Names for the levels in the resulting hierarchical index.
-        verify_integrity : bool, default False
-            Check whether the new concatenated axis contains duplicates. This can
-            be very expensive relative to the actual data concatenation.
-        sort : bool, default False
-            Sort non-concatenation axis if it is not already aligned when `join`
-            is 'outer'.
-            This has no effect when ``join='inner'``, which already preserves
-            the order of the non-concatenation axis.
-
-        .. versionchanged:: 1.0.0
-
-            Changed to not sort by default.
-
-        copy : bool, default True
-            If False, do not copy data unnecessarily.
-
-        Returns
-        -------
-        object, type of objs
-            When concatenating all ``Series`` along the index (axis=0), a
-            ``Series`` is returned. When ``objs`` contains at least one
-            ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
-            the columns (axis=1), a ``DataFrame`` is returned.
-
-        See Also
-        --------
-        DataFrame.join : Join DataFrames using indexes.
-        DataFrame.merge : Merge DataFrames by indexes or columns.
-
-        Notes
-        -----
-        The keys, levels, and names arguments are all optional.
-
-        A walkthrough of how this method fits in with other tools for combining
-        pandas objects can be found `here
-        <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
-
-
-        It is not recommended to build DataFrames by adding single rows in a
+    Concatenate pandas objects along a particular axis with optional set logic
+    along the other axes.
+
+    Can also add a layer of hierarchical indexing on the concatenation axis,
+    which may be useful if the labels are the same (or overlapping) on
+    the passed axis number.
+
+    Parameters
+    ----------
+    objs : a sequence or mapping of Series or DataFrame objects
+        If a mapping is passed, the sorted keys will be used as the `keys`
+        argument, unless it is passed, in which case the values will be
+        selected (see below). Any None objects will be dropped silently unless
+        they are all None in which case a ValueError will be raised.
+    axis : {0/'index', 1/'columns'}, default 0
+        The axis to concatenate along.
+    join : {'inner', 'outer'}, default 'outer'
+        How to handle indexes on other axis (or axes).
+    ignore_index : bool, default False
+        If True, do not use the index values along the concatenation axis. The
+        resulting axis will be labeled 0, ..., n - 1. This is useful if you are
+        concatenating objects where the concatenation axis does not have
+        meaningful indexing information. Note the index values on the other
+        axes are still respected in the join.
+    keys : sequence, default None
+        If multiple levels passed, should contain tuples. Construct
+        hierarchical index using the passed keys as the outermost level.
+    levels : list of sequences, default None
+        Specific levels (unique values) to use for constructing a
+        MultiIndex. Otherwise they will be inferred from the keys.
+    names : list, default None
+        Names for the levels in the resulting hierarchical index.
+    verify_integrity : bool, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation.
+    sort : bool, default False
+        Sort non-concatenation axis if it is not already aligned when `join`
+        is 'outer'.
+        This has no effect when ``join='inner'``, which already preserves
+        the order of the non-concatenation axis.
+
+    .. versionchanged:: 1.0.0
+
+        Changed to not sort by default.
+
+    copy : bool, default True
+        If False, do not copy data unnecessarily.
+
+    Returns
+    -------
+    object, type of objs
+        When concatenating all ``Series`` along the index (axis=0), a
+        ``Series`` is returned. When ``objs`` contains at least one
+        ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
+        the columns (axis=1), a ``DataFrame`` is returned.
+
+    See Also
+    --------
+    DataFrame.join : Join DataFrames using indexes.
+    DataFrame.merge : Merge DataFrames by indexes or columns.
+
+    Notes
+    -----
+    The keys, levels, and names arguments are all optional.
+
+    A walkthrough of how this method fits in with other tools for combining
+    pandas objects can be found `here
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
+
+
+    It is not recommended to build DataFrames by adding single rows in a
     for loop. Build a list of rows and make a DataFrame in a single concat.
 
-        Examples
-        --------
-        Combine two ``Series``.
-
-        >>> s1 = pd.Series(['a', 'b'])
-        >>> s2 = pd.Series(['c', 'd'])
-        >>> pd.concat([s1, s2])
-        0    a
+    Examples
+    --------
+    Combine two ``Series``.
+
+    >>> s1 = pd.Series(['a', 'b'])
+    >>> s2 = pd.Series(['c', 'd'])
+    >>> pd.concat([s1, s2])
+    0    a
+    1    b
+    0    c
+    1    d
+    dtype: object
+
+    Clear the existing index and reset it in the result
+    by setting the ``ignore_index`` option to ``True``.
+
+    >>> pd.concat([s1, s2], ignore_index=True)
+    0    a
+    1    b
+    2    c
+    3    d
+    dtype: object
+
+    Add a hierarchical index at the outermost level of
+    the data with the ``keys`` option.
+
+    >>> pd.concat([s1, s2], keys=['s1', 's2'])
+    s1  0    a
     1    b
-        0    c
+    s2  0    c
     1    d
-        dtype: object
+    dtype: object
+
+    Label the index keys you create with the ``names`` option.
+
+    >>> pd.concat([s1, s2], keys=['s1', 's2'],
+    ...           names=['Series name', 'Row ID'])
+    Series name  Row ID
+    s1           0         a
+                 1         b
+    s2           0         c
+                 1         d
+    dtype: object
+
+    Combine two ``DataFrame`` objects with identical columns.
+
+    >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
+    ...                    columns=['letter', 'number'])
+    >>> df1
+      letter  number
+    0      a       1
+    1      b       2
+    >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
+    ...                    columns=['letter', 'number'])
+    >>> df2
+      letter  number
+    0      c       3
+    1      d       4
+    >>> pd.concat([df1, df2])
+      letter  number
+    0      a       1
+    1      b       2
+    0      c       3
+    1      d       4
+
+    Combine ``DataFrame`` objects with overlapping columns
+    and return everything. Columns outside the intersection will
+    be filled with ``NaN`` values.
+
+    >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
+    ...                    columns=['letter', 'number', 'animal'])
+    >>> df3
+      letter  number animal
+    0      c       3    cat
+    1      d       4    dog
+    >>> pd.concat([df1, df3], sort=False)
+      letter  number animal
+    0      a       1    NaN
+    1      b       2    NaN
+    0      c       3    cat
+    1      d       4    dog
+
+    Combine ``DataFrame`` objects with overlapping columns
+    and return only those that are shared by passing ``inner`` to
+    the ``join`` keyword argument.
+
+    >>> pd.concat([df1, df3], join="inner")
+      letter  number
+    0      a       1
+    1      b       2
+    0      c       3
+    1      d       4
+
+    Combine ``DataFrame`` objects horizontally along the x axis by
+    passing in ``axis=1``.
+
+    >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
+    ...                    columns=['animal', 'name'])
+    >>> pd.concat([df1, df4], axis=1)
+      letter  number  animal    name
+    0      a       1    bird   polly
+    1      b       2  monkey  george
+
+    Prevent the result from including duplicate index values with the
+    ``verify_integrity`` option.
+
+    >>> df5 = pd.DataFrame([1], index=['a'])
+    >>> df5
+       0
+    a  1
+    >>> df6 = pd.DataFrame([2], index=['a'])
+    >>> df6
+       0
+    a  2
+    >>> pd.concat([df5, df6], verify_integrity=True)
+    Traceback (most recent call last):
+        ...
+    ValueError: Indexes have overlapping values: ['a']
+
+    Append a single row to the end of a ``DataFrame`` object.
+
+    >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
+    >>> df7
+       a  b
+    0  1  2
+    >>> new_row = pd.Series({'a': 3, 'b': 4})
+    >>> new_row
+    0 3 4
+    >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
+    a b
+    0 1 2
+    1 3 4
     """
     op = _Concatenator(
         objs,

From 721f63de38a4bdcafb27dcf49ffd1749ae2394e6 Mon Sep 17 00:00:00 2001
From: anetakahle
Date: Sat, 21 May 2022 21:40:53 +0200
Subject: [PATCH 11/17] spaces fix

---
 pandas/core/reshape/concat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 5d33af1bd8701..5d600e344759d 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -359,9 +359,9 @@ def concat(
     >>> new_row
     0 3 4
     >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
-    a b
-    0 1 2
-    1 3 4
+       a  b
+    0  1  2
+    1  3  4
     """
     op = _Concatenator(
         objs,

From 83ed246535cd8ead9d46343a3424e6491382560b Mon Sep 17 00:00:00 2001
From: anetakahle
Date: Sat, 21 May 2022 22:33:54 +0200
Subject: [PATCH 12/17] small fix

---
 pandas/core/reshape/concat.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 5d600e344759d..5b3dedb57d6c6 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -357,7 +357,8 @@ def concat(
     0  1  2
     >>> new_row = pd.Series({'a': 3, 'b': 4})
     >>> new_row
-    0 3 4
+    a 3
+    b 4
     >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
        a  b
     0  1  2

From 6a1f1719b4f05a50398044516335cbe3735517fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aneta=20Kahleov=C3=A1?=
Date: Sat, 21 May 2022 23:07:35 +0200
Subject: [PATCH 13/17] removed unrelated white spaces

---
 pandas/core/reshape/concat.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 5b3dedb57d6c6..05ba07f184bfc 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -200,7 +200,7 @@ def concat(
 
     .. versionchanged:: 1.0.0
 
-        Changed to not sort by default.
+        Changed to not sort by default.
 
     copy : bool, default True
         If False, do not copy data unnecessarily.
@@ -269,9 +269,9 @@ def concat(
     ...           names=['Series name', 'Row ID'])
     Series name  Row ID
     s1           0         a
-                 1         b
+                 1         b
     s2           0         c
-                 1         d
+                 1         d
     dtype: object
 
     Combine two ``DataFrame`` objects with identical columns.
@@ -279,7 +279,7 @@ def concat(
     >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
     ...                    columns=['letter', 'number'])
     >>> df1
-      letter  number
+      letter  number
     0      a       1
     1      b       2
     >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
     ...                    columns=['letter', 'number'])
     >>> df2
       letter  number
     0      c       3
     1      d       4
     >>> pd.concat([df1, df2])
       letter  number
     0      a       1
     1      b       2
     0      c       3
     1      d       4
@@ -302,11 +302,11 @@ def concat(
     >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
     ...                    columns=['letter', 'number', 'animal'])
     >>> df3
-      letter  number animal
+      letter  number animal
     0      c       3    cat
     1      d       4    dog
     >>> pd.concat([df1, df3], sort=False)
-      letter  number animal
+      letter  number animal
     0      a       1    NaN
     1      b       2    NaN
     0      c       3    cat
     1      d       4    dog
@@ -317,7 +317,7 @@ def concat(
     the ``join`` keyword argument.
 
     >>> pd.concat([df1, df3], join="inner")
-      letter  number
+      letter  number
     0      a       1
     1      b       2
     0      c       3
     1      d       4
@@ -329,7 +329,7 @@ def concat(
 
     >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
     ...                    columns=['animal', 'name'])
     >>> pd.concat([df1, df4], axis=1)
-      letter  number  animal    name
+      letter  number  animal    name
     0      a       1    bird   polly
     1      b       2  monkey  george
@@ -338,11 +338,11 @@ def concat(
 
     >>> df5 = pd.DataFrame([1], index=['a'])
     >>> df5
-       0
+       0
     a  1
     >>> df6 = pd.DataFrame([2], index=['a'])
     >>> df6
-       0
+       0
     a  2
     >>> pd.concat([df5, df6], verify_integrity=True)
     Traceback (most recent call last):
         ...

From 16c4cfde08492d426f66caabd1c159af42e1ce22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aneta=20Kahleov=C3=A1?=
Date: Sat, 21 May 2022 23:08:40 +0200
Subject: [PATCH 14/17] Update concat.py

---
 pandas/core/reshape/concat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 05ba07f184bfc..670f20b185d9d 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -285,11 +285,11 @@ def concat(
     >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
     ...                    columns=['letter', 'number'])
     >>> df2
-      letter  number
+      letter  number
     0      c       3
     1      d       4
     >>> pd.concat([df1, df2])
-      letter  number
+      letter  number
     0      a       1
     1      b       2
     0      c       3
     1      d       4

From d0c8af37c95879fcb2ed893ea05693319d8a11b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aneta=20Kahleov=C3=A1?=
Date: Sun, 22 May 2022 14:25:18 +0200
Subject: [PATCH 15/17] Update concat.py

---
 pandas/core/reshape/concat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 670f20b185d9d..cd82d14b37c86 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -357,8 +357,8 @@ def concat(
     0  1  2
     >>> new_row = pd.Series({'a': 3, 'b': 4})
     >>> new_row
-    a 3
-    b 4
+    a    3
+    b    4
     >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
        a  b
     0  1  2

From 9aa4b28876cb4f4f85c85e7f2302d40b763d800d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aneta=20Kahleov=C3=A1?=
Date: Sun, 22 May 2022 14:49:02 +0200
Subject: [PATCH 16/17] Update concat.py

---
 pandas/core/reshape/concat.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index cd82d14b37c86..1e9afec765a23 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -359,6 +359,7 @@ def concat(
     >>> new_row
     a    3
     b    4
+    dtype: int64
     >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
        a  b
     0  1  2

From 81b98096677a294b700be7eb7134be4a9d41a22a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aneta=20Kahleov=C3=A1?=
Date: Fri, 27 May 2022 12:25:36 +0200
Subject: [PATCH 17/17] Update concat.py

---
 pandas/core/reshape/concat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 1e9afec765a23..523cd56db3e0a 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -226,7 +226,6 @@ def concat(
     pandas objects can be found `here
     <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
 
-
     It is not recommended to build DataFrames by adding single rows in a
     for loop. Build a list of rows and make a DataFrame in a single concat.
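
A short illustration of the note these patches keep reworking ("It is not
recommended to build DataFrames by adding single rows in a for loop"): each
``pd.concat`` call copies all of its inputs into a new object, so appending one
row per iteration does quadratic work overall, while collecting the pieces in a
list and concatenating once copies each row a single time. A minimal sketch of
the two patterns (the helper ``make_row`` and the sizes are illustrative, not
part of the patch):

    import pandas as pd

    def make_row(i):
        # One single-row DataFrame per iteration (hypothetical helper).
        return pd.DataFrame({'a': [i], 'b': [i * 2]})

    # Discouraged: re-concatenating inside the loop copies the accumulated
    # frame on every iteration.
    df = make_row(0)
    for i in range(1, 1000):
        df = pd.concat([df, make_row(i)], ignore_index=True)

    # Recommended: build a list of rows, then concatenate once.
    df = pd.concat([make_row(i) for i in range(1000)], ignore_index=True)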