Merge pull request #6226 from y-p/PR_clip_excel

y-p · y-p · commit 664bccf70268 · 2014-02-04T00:57:33.000-08:00
ENH: pd.read_clipboard detects tab-separated data (excel) GH6223
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -56,12 +56,16 @@ New features
 API Changes
 ~~~~~~~~~~~
 
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
 
 Improvements to existing features
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+- pd.read_clipboard will, if 'sep' is unspecified, try to detect data copied from a spreadsheet
+  and parse accordingly. (:issue:`6223`)
+
 .. _release.bug_fixes-0.14.0:
 
 Bug Fixes
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -28,10 +28,13 @@ There are no deprecations of prior behavior in 0.14.0
 Enhancements
 ~~~~~~~~~~~~
 
+- pd.read_clipboard will, if 'sep' is unspecified, try to detect data copied from a spreadsheet
+  and parse accordingly. (:issue:`6223`)
+
+
 Performance
 ~~~~~~~~~~~
 
-
 Experimental
 ~~~~~~~~~~~~
 
diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py
@@ -14,12 +14,29 @@ def read_clipboard(**kwargs):  # pragma: no cover
     -------
     parsed : DataFrame
     """
-    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
-        kwargs['sep'] = '\s+'
     from pandas.util.clipboard import clipboard_get
     from pandas.io.parsers import read_table
     text = clipboard_get()
 
+    # Excel copies into clipboard with \t separation
+    # inspect no more than the first 10 lines, if they
+    # all contain an equal number (>0) of tabs, infer
+    # that this came from excel and set 'sep' accordingly
+    lines = text[:10000].split('\n')[:-1][:10]
+
+    # Need to remove leading white space, since read_table
+    # accepts:
+    #    a  b
+    # 0  1  2
+    # 1  3  4
+
+    counts = set([x.lstrip().count('\t') for x in lines])
+    if len(lines)>1 and len(counts) == 1 and counts.pop() != 0:
+        kwargs['sep'] = '\t'
+
+    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
+        kwargs['sep'] = '\s+'
+
     # try to decode (if needed on PY3)
     if compat.PY3:
         try:
diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py
@@ -2,6 +2,7 @@
 from numpy.random import randint
 
 import nose
+import pandas as pd
 
 from pandas import DataFrame
 from pandas import read_clipboard
@@ -65,3 +66,37 @@ def test_round_trip_frame_string(self):
     def test_round_trip_frame(self):
         for dt in self.data_types:
             self.check_round_trip_frame(dt)
+
+    def test_read_clipboard_infer_excel(self):
+        from textwrap import dedent
+        from pandas.util.clipboard import clipboard_set
+
+        text = dedent("""
+            John James	Charlie Mingus
+            1	2
+            4	Harry Carney
+            """.strip())
+        clipboard_set(text)
+        df = pd.read_clipboard()
+
+        # excel data is parsed correctly
+        self.assertEqual(df.iloc[1][1], 'Harry Carney')
+
+        # differing tab counts across lines don't trigger the inference
+        text = dedent("""
+            a\t b
+            1  2
+            3  4
+            """.strip())
+        clipboard_set(text)
+        res = pd.read_clipboard()
+
+        text = dedent("""
+            a  b
+            1  2
+            3  4
+            """.strip())
+        clipboard_set(text)
+        exp = pd.read_clipboard()
+
+        tm.assert_frame_equal(res, exp)