From ec9f59a0feb13ae2e1a47aeda3f07eef18b27be9 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 25 Oct 2016 14:53:23 -0400 Subject: [PATCH] BUG: Accept unicode quotechars again in pd.read_csv Closes gh-14477. --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/parsers.py | 3 +++ pandas/io/tests/parser/quoting.py | 15 ++++++++++++++- pandas/parser.pyx | 3 ++- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 8f5f78a5e93f7..0fdace12e6474 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -35,6 +35,7 @@ Bug Fixes +- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8cf04e08ab03..e0127c3544971 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1759,6 +1759,9 @@ def __init__(self, f, **kwds): self.delimiter = kwds['delimiter'] self.quotechar = kwds['quotechar'] + if isinstance(self.quotechar, compat.text_type): + self.quotechar = str(self.quotechar) + self.escapechar = kwds['escapechar'] self.doublequote = kwds['doublequote'] self.skipinitialspace = kwds['skipinitialspace'] diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py index d0f1493be0621..765cec8243a0a 100644 --- a/pandas/io/tests/parser/quoting.py +++ b/pandas/io/tests/parser/quoting.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm from pandas import DataFrame -from pandas.compat import StringIO +from pandas.compat import PY3, StringIO, u class QuotingTests(object): @@ -138,3 +138,16 @@ def test_double_quote(self): result = self.read_csv(StringIO(data), quotechar='"', doublequote=False) tm.assert_frame_equal(result, expected) + + def test_quotechar_unicode(self): + # See gh-14477 + data = 'a\n1' + expected = DataFrame({'a': [1]}) + + result = self.read_csv(StringIO(data), quotechar=u('"')) + tm.assert_frame_equal(result, expected) + + # Compared to Python 3.x, Python 2.x does not handle unicode well. + if PY3: + result = self.read_csv(StringIO(data), quotechar=u('\u0394')) + tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 12525c7a9c587..0a2824e74120c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -570,7 +570,8 @@ cdef class TextReader: if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: raise TypeError('bad "quoting" value') - if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + if not isinstance(quote_char, (str, compat.text_type, + bytes)) and quote_char is not None: dtype = type(quote_char).__name__ raise TypeError('"quotechar" must be string, ' 'not {dtype}'.format(dtype=dtype))