Merge pull request #6132 from hayd/str_get_dummies

hayd · hayd · commit f89ae3413990 · 2014-01-27T16:52:46.000-08:00
ENH get_dummies str method
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1155,7 +1155,6 @@ can also be used.
 Testing for Strings that Match or Contain a Pattern
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-
 You can check whether elements contain a pattern:
 
 .. ipython:: python
@@ -1221,6 +1220,21 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
     ``lower``,Equivalent to ``str.lower``
     ``upper``,Equivalent to ``str.upper``
 
+
+Getting indicator variables from separated strings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can extract dummy variables from string columns.
+For example, if they are separated by a ``'|'``:
+
+  .. ipython:: python
+
+      s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
+      s.str.get_dummies(sep='|')
+
+See also ``pd.get_dummies``.
+
+
 .. _basics.sorting:
 
 Sorting by index and value
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
@@ -43,6 +43,14 @@ API changes
 - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`).
   See :ref:`NA Values <io.na_values>`.
 
+- Added ``Series.str.get_dummies`` vectorized string method (:issue:`6021`), to extract
+  dummy/indicator variables for separated string columns:
+
+  .. ipython:: python
+
+      s = Series(['a', 'a|b', np.nan, 'a|c'])
+      s.str.get_dummies(sep='|')
+
 - Added the ``NDFrame.equals()`` method to compare if two NDFrames are
   equal have equal axes, dtypes, and values. Added the
   ``array_equivalent`` function to compare if two ndarrays are
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -941,6 +941,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
     1  0  1    0
     2  0  0    1
 
+    See also ``Series.str.get_dummies``.
+
     """
     # Series avoids inconsistent NaN handling
     cat = Categorical.from_array(Series(data))
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -187,7 +187,6 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
     else:
         f = lambda x: pat in x
     return _na_map(f, arr, na)
-        
 
 
 def str_startswith(arr, pat, na=np.nan):
@@ -460,6 +459,46 @@ def f(x):
     return result
 
 
+def str_get_dummies(arr, sep='|'):
+    """
+    Split each string by sep and return a frame of dummy/indicator variables.
+
+    Examples
+    --------
+    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
+       a  b  c
+    0  1  1  0
+    1  1  0  0
+    2  1  0  1
+
+    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
+       a  b  c
+    0  1  1  0
+    1  0  0  0
+    2  1  0  1
+
+    Missing values give an all-zero row.  See also ``pd.get_dummies``.
+
+    """
+    # TODO remove this hack?  NaN -> '' so missing entries contribute no tags
+    arr = arr.fillna('')
+    try:
+        arr = sep + arr + sep  # pad so every tag has sep on both sides
+    except TypeError:  # e.g. non-string entries; fall back to their str form
+        arr = sep + arr.astype(str) + sep
+    # collect every distinct token; '' (from the padding) is not a real tag
+    tags = set()
+    for ts in arr.str.split(sep):
+        tags.update(ts)
+    tags = sorted(tags - set([""]))
+
+    dummies = np.empty((len(arr), len(tags)), dtype=int)
+    # one column per tag; sep + tag + sep only matches a whole padded token
+    for i, t in enumerate(tags):
+        pat = sep + t + sep
+        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
+    return DataFrame(dummies, arr.index, tags)
+
 
 def str_join(arr, sep):
     """
@@ -843,7 +882,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         result = str_contains(self.series, pat, case=case, flags=flags,
                               na=na, regex=regex)
         return self._wrap_result(result)
-            
+
     @copy(str_replace)
     def replace(self, pat, repl, n=-1, case=True, flags=0):
         result = str_replace(self.series, pat, repl, n=n, case=case,
@@ -899,6 +938,11 @@ def rstrip(self, to_strip=None):
         result = str_rstrip(self.series, to_strip)
         return self._wrap_result(result)
 
+    @copy(str_get_dummies)
+    def get_dummies(self, sep='|'):
+        dummies = str_get_dummies(self.series, sep)
+        return self._wrap_result(dummies)
+
     count = _pat_wrapper(str_count, flags=True)
     startswith = _pat_wrapper(str_startswith, na=True)
     endswith = _pat_wrapper(str_endswith, na=True)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -366,7 +366,6 @@ def test_replace(self):
         result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
         tm.assert_series_equal(result, exp)
 
-
     def test_repeat(self):
         values = Series(['a', 'b', NA, 'c', NA, 'd'])
 
@@ -465,7 +464,7 @@ def test_extract(self):
         # Contains tests like those in test_match and some others.
 
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        er = [NA, NA] # empty row
+        er = [NA, NA]  # empty row
 
         result = values.str.extract('.*(BAD[_]+).*(BAD)')
         exp = DataFrame([['BAD__', 'BAD'], er, er])
@@ -549,6 +548,19 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
 
+    def test_get_dummies(self):
+        s = Series(['a|b', 'a|c', np.nan])  # NaN row should be all zeros
+        result = s.str.get_dummies('|')
+        expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+                             columns=list('abc'))
+        tm.assert_frame_equal(result, expected)
+        # a non-string entry (the int 7) is expected to become the tag '7'
+        s = Series(['a;b', 'a', 7])
+        result = s.str.get_dummies(';')
+        expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
+                             columns=list('7ab'))
+        tm.assert_frame_equal(result, expected)
+
     def test_join(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
         result = values.str.split('_').str.join('_')
diff --git a/vb_suite/strings.py b/vb_suite/strings.py
@@ -45,6 +45,11 @@ def make_series(letters, strlen, size):
 strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup)
 strings_get = Benchmark("many.str.get(0)", setup)
 
+setup = setup + """
+s = make_series(string.uppercase, strlen=10, size=10000).str.join('|')
+"""
+strings_get_dummies = Benchmark("s.str.get_dummies('|')", setup)
+
 setup = common_setup + """
 import pandas.util.testing as testing
 ser = pd.Series(testing.makeUnicodeIndex())