pandas-dev · jreback · Dec 1, 2017 · Jul 19, 2017 · Sep 10, 2017 · Sep 11, 2017
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -504,7 +504,7 @@ the data in DataFrame.
 See the :ref:`cookbook<cookbook.merge>` for some advanced strategies.
 
 Users who are familiar with SQL but new to pandas might be interested in a
-:ref:`comparison with SQL<compare_with_sql.join>`.
+:ref:`comparison with SQL<compare_with_sql.join>`.
 
 pandas provides a single function, ``merge``, as the entry point for all
 standard database join operations between DataFrame objects:
@@ -518,14 +518,16 @@ standard database join operations between DataFrame objects:
 
 - ``left``: A DataFrame object
 - ``right``: Another DataFrame object
-- ``on``: Columns (names) to join on. Must be found in both the left and
-  right DataFrame objects. If not passed and ``left_index`` and
+- ``on``: Column or index level names to join on. Must be found in both the left
+  and right DataFrame objects. If not passed and ``left_index`` and
   ``right_index`` are ``False``, the intersection of the columns in the
   DataFrames will be inferred to be the join keys
-- ``left_on``: Columns from the left DataFrame to use as keys. Can either be
-  column names or arrays with length equal to the length of the DataFrame
-- ``right_on``: Columns from the right DataFrame to use as keys. Can either be
-  column names or arrays with length equal to the length of the DataFrame
+- ``left_on``: Columns or index levels from the left DataFrame to use as
+  keys. Can either be column names, index level names, or arrays with length
+  equal to the length of the DataFrame
+- ``right_on``: Columns or index levels from the right DataFrame to use as
+  keys. Can either be column names, index level names, or arrays with length
+  equal to the length of the DataFrame
 - ``left_index``: If ``True``, use the index (row labels) from the left
   DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex
   (hierarchical), the number of levels must match the number of join keys
@@ -1125,6 +1127,56 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using
           labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+.. _merging.merge_on_columns_and_levels:
+
+Merging on a combination of columns and index levels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.21
+
+Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters
+may refer to either column names or index level names.  This enables merging
+``DataFrame`` instances on a combination of index levels and columns without
+resetting indexes.
+
+.. ipython:: python
+
+   left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1')
+
+   left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
+                        'B': ['B0', 'B1', 'B2', 'B3'],
+                        'key2': ['K0', 'K1', 'K0', 'K1']},
+                       index=left_index)
+
+   right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1')
+
+   right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
+                         'D': ['D0', 'D1', 'D2', 'D3'],
+                         'key2': ['K0', 'K0', 'K0', 'K1']},
+                        index=right_index)
+
+   result = left.merge(right, on=['key1', 'key2'])
+
+.. ipython:: python
+   :suppress:
+
+   @savefig merge_on_index_and_column.png
+   p.plot([left, right], result,
+          labels=['left', 'right'], vertical=False);
+   plt.close('all');
+
+.. note::
+
+   When DataFrames are merged on a string that matches an index level in both
+   frames, the index level is preserved as an index level in the resulting
+   DataFrame.
+
+.. note::
+
+   If a string matches both a column name and an index level name, then a
+   warning is issued and the column takes precedence. This will result in an
+   ambiguity error in a future version.
+
 Overlapping value columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -165,6 +165,37 @@ and new ``CategoricalDtype``.
 
 See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
 
+
+.. _whatsnew_0210.enhancements.merge_on_columns_and_levels:
+
+Merging on a combination of columns and index levels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on``
+parameters may now refer to either column names or index level names.  This enables
+merging ``DataFrame`` instances on a combination of index levels and columns
+without resetting indexes. See the :ref:`Merge on columns and levels
+<merging.merge_on_columns_and_levels>` documentation section.
+
+.. ipython:: python
+
+   left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1')
+
+   left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
+                        'B': ['B0', 'B1', 'B2', 'B3'],
+                        'key2': ['K0', 'K1', 'K0', 'K1']},
+                       index=left_index)
+
+   right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1')
+
+   right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
+                         'D': ['D0', 'D1', 'D2', 'D3'],
+                         'key2': ['K0', 'K0', 'K0', 'K1']},
+                        index=right_index)
+
+   left.merge(right, on=['key1', 'key2'])
+
+
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
@@ -187,6 +218,8 @@ Other Enhancements
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`. (:issue:`15838`, :issue:`17438`)
 - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
 - Read/write methods that infer compression (:func:`read_csv`, :func:`read_table`, :func:`read_pickle`, and :meth:`~DataFrame.to_pickle`) can now infer from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
+- :func:`DataFrame.merge` now accepts index level names as ``on``, ``left_on``, and ``right_on`` parameters, allowing frames to be merged on a combination of columns and index levels (:issue:`14355`)
+- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
 - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`).
 - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`)
 - :func:`Styler.where` has been implemented. It is as a convenience for :func:`Styler.applymap` and enables simple DataFrame styling on the Jupyter notebook (:issue:`17474`).

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -68,7 +68,7 @@
                                 standardize_mapping)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import (Index, MultiIndex, _ensure_index,
-                               _ensure_index_from_sequences)
+                               _ensure_index_from_sequences, RangeIndex)
 from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
                                   check_bool_indexer)
 from pandas.core.internals import (BlockManager,
@@ -139,16 +139,17 @@
     * inner: use intersection of keys from both frames, similar to a SQL inner
       join; preserve the order of the left keys
 on : label or list
-    Field names to join on. Must be found in both DataFrames. If on is
-    None and not merging on indexes, then it merges on the intersection of
-    the columns by default.
+    Column or index level names to join on. These must be found in both
+    DataFrames. If on is None and not merging on indexes then this defaults to
+    the intersection of the columns in both DataFrames.
 left_on : label or list, or array-like
-    Field names to join on in left DataFrame. Can be a vector or list of
-    vectors of the length of the DataFrame to use a particular vector as
-    the join key instead of columns
+    Column or index level names to join on in the left DataFrame. Can also
+    be a vector or list of vectors of the length of the left DataFrame.
+    These vectors are treated as though they are columns.
 right_on : label or list, or array-like
-    Field names to join on in right DataFrame or vector/list of vectors per
-    left_on docs
+    Column or index level names to join on in the right DataFrame. Can also
+    be a vector or list of vectors of the length of the right DataFrame.
+    These vectors are treated as though they are columns.
 left_index : boolean, default False
     Use the index from the left DataFrame as the join key(s). If it is a
     MultiIndex, the number of keys in the other DataFrame (either the index
@@ -2160,6 +2161,159 @@ def _getitem_frame(self, key):
             raise ValueError('Must pass DataFrame with boolean values only')
         return self.where(key)
 
+    # -------------------------------------------------------------------------
+    # Label or Level Combination Helpers
+
+    @Appender(_shared_docs['_is_level_reference'])
+    def _is_level_reference(self, key, axis=0):
+        axis = self._get_axis_number(axis)  # compared against 0 / 1 below
+        if axis == 0:
+            return (isinstance(key, compat.string_types) and
+                    key not in self.columns and  # column labels take priority
+                    key in self.index.names)
+        elif axis == 1:
+            return (isinstance(key, compat.string_types) and
+                    key not in self.index and  # index labels take priority
+                    key in self.columns.names)
+
+    @Appender(_shared_docs['_is_label_reference'])
+    def _is_label_reference(self, key, axis=0):
+        axis = self._get_axis_number(axis)  # compared against 0 / 1 below
+        if axis == 0:  # a label is a string found in the columns
+            return (isinstance(key, compat.string_types) and
+                    key in self.columns)
+        elif axis == 1:  # a label is a string found in the index
+            return (isinstance(key, compat.string_types) and
+                    key in self.index)
+
+    @Appender(_shared_docs['_check_label_or_level_ambiguity'])
+    def _check_label_or_level_ambiguity(self, key, axis=0):
+        # Returns True (after a FutureWarning) if key is a label and a level
+        axis = self._get_axis_number(axis)
+
+        def raise_warning():
+            # Despite the name this emits a FutureWarning; nothing is raised
+            # Build an informative and grammatical warning
+            level_article, level_type = (('an', 'index')
+                                         if axis == 0 else
+                                         ('a', 'column'))
+
+            label_article, label_type = (('a', 'column')
+                                         if axis == 0 else
+                                         ('an', 'index'))
+
+            warnings.warn(
+                ("'{key}' is both {level_article} {level_type} level and "
+                 "{label_article} {label_type} label.\n"
+                 "Defaulting to {label_type}, but this will raise an "
+                 "ambiguity error in a future version"
+                 ).format(key=key,
+                          level_article=level_article,
+                          level_type=level_type,
+                          label_article=label_article,
+                          label_type=label_type), FutureWarning)
+
+        if axis == 0:  # label = column, level = index level name
+            if (isinstance(key, compat.string_types) and
+                    key in self.columns and
+                    key in self.index.names):
+
+                raise_warning()
+                return True
+            else:
+                return False
+        else:  # label = index entry, level = columns level name
+            if (isinstance(key, compat.string_types) and
+                    key in self.index and
+                    key in self.columns.names):
+
+                raise_warning()
+                return True
+            else:
+                return False
+
+    @Appender(_shared_docs['_get_label_or_level_values'])
+    def _get_label_or_level_values(self, key, axis=0):
+        axis = self._get_axis_number(axis)
+        if axis == 0:
+            if key in self:  # label lookup is tried before level lookup
+                self._check_label_or_level_ambiguity(key, axis=axis)
+                values = self[key]._values
+            elif self._is_level_reference(key, axis=axis):
+                values = self.index.get_level_values(key)._values
+            else:
+                raise KeyError(key)
+        else:
+            if key in self.index:  # label lookup is tried before level lookup
+                self._check_label_or_level_ambiguity(key, axis=axis)
+                values = self.loc[key]._values
+            elif self._is_level_reference(key, axis=axis):
+                values = self.columns.get_level_values(key)._values
+            else:
+                raise KeyError(key)
+
+        # Check for duplicates: ndim > 1 is reported as a non-unique label
+        if values.ndim > 1:
+            label_axis_name = 'column' if axis == 0 else 'index'
+            raise ValueError(("The {label_axis_name} label '{key}' "
+                              "is not unique")
+                             .format(key=key,
+                                     label_axis_name=label_axis_name))
+
+        return values
+
+    @Appender(_shared_docs['_drop_labels_or_levels'])
+    def _drop_labels_or_levels(self, keys, axis=0):
+        axis = self._get_axis_number(axis)
+        keys = com._maybe_make_list(keys)
+
+        # Validate keys: each must be a label or level reference on this axis
+        invalid_keys = [k for k in keys if not
+                        self._is_label_or_level_reference(k, axis=axis)]
+
+        if invalid_keys:
+            raise ValueError(("The following keys are not valid labels or "
+                             "levels for {axis}: {invalid_keys}")
+                             .format(axis=axis,
+                                     invalid_keys=invalid_keys))
+
+        # Compute levels and labels to drop
+        levels_to_drop = [k for k in keys
+                          if self._is_level_reference(k, axis=axis)]
+
+        labels_to_drop = [k for k in keys
+                          if not self._is_level_reference(k, axis=axis)]
+
+        # Perform copy upfront and then use inplace operations below.
+        # This ensures that we always perform exactly one copy.
+        # ``copy`` and/or ``inplace`` options could be added in the future.
+        dropped = self.copy()
+
+        if axis == 0:
+            # Handle dropping index levels
+            if levels_to_drop:
+                dropped.reset_index(levels_to_drop, drop=True, inplace=True)
+
+            # Handle dropping column labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=1, inplace=True)
+        else:
+            # Handle dropping column levels
+            if levels_to_drop:
+                if isinstance(dropped.columns, MultiIndex):
+                    # Drop the specified levels from the MultiIndex
+                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
+                else:
+                    # Drop the last level of Index by replacing with
+                    # a RangeIndex
+                    dropped.columns = RangeIndex(dropped.columns.size)
+
+            # Handle dropping index labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=0, inplace=True)
+
+        return dropped
+
     def query(self, expr, inplace=False, **kwargs):
         """Query the columns of a frame with a boolean expression.