diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f93e8f4240787..d347792cdaeb6 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -968,3 +968,5 @@ Bug Fixes
 
 - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
+
+- Bug in ``to_csv()`` in Python 3 which emitted b'' around bytes (:issue:`9712`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 18b67c41b4554..0484730ebb5f9 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -2020,6 +2020,14 @@ def re_replacer(s):
 
         return block
 
+    def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
+                        bytes_encoding=None, **kwargs):
+        result = Block.to_native_types(self, slicer, na_rep, quoting, **kwargs)
+        if bytes_encoding is not None:
+            for arr in result:
+                lib.object_array_decode_bytes(arr, bytes_encoding)
+        return result
+
 
 class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
     __slots__ = ()
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
index 50d54ddb95100..0d6d16bd27d3e 100644
--- a/pandas/formats/format.py
+++ b/pandas/formats/format.py
@@ -1378,6 +1378,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                                not self.tupleize_cols)
 
+        # in Python 3, decode bytes to str so strings print without b''
+        if compat.PY3:
+            self.bytes_encoding = (encoding or get_option("display.encoding"))
+        else:
+            self.bytes_encoding = None
+
         # validate mi options
         if self.has_mi_columns:
             if cols is not None:
@@ -1387,6 +1393,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         if cols is not None:
             if isinstance(cols, Index):
                 cols = cols.to_native_types(na_rep=na_rep,
+                                            bytes_encoding=self.bytes_encoding,
                                             float_format=float_format,
                                             date_format=date_format,
                                             quoting=self.quoting)
@@ -1399,6 +1406,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             cols = self.obj.columns
         if isinstance(cols, Index):
             cols = cols.to_native_types(na_rep=na_rep,
+                                        bytes_encoding=self.bytes_encoding,
                                         float_format=float_format,
                                         date_format=date_format,
                                         quoting=self.quoting)
@@ -1506,6 +1514,8 @@ def _save_header(self):
             else:
                 encoded_labels = []
 
+        self._bytes_to_str(encoded_labels)
+
         if not has_mi_columns:
             encoded_labels += list(write_cols)
 
@@ -1565,6 +1575,7 @@ def _save_chunk(self, start_i, end_i):
         for i in range(len(self.blocks)):
             b = self.blocks[i]
             d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                  bytes_encoding=self.bytes_encoding,
                                   float_format=self.float_format,
                                   decimal=self.decimal,
                                   date_format=self.date_format,
@@ -1575,6 +1586,7 @@ def _save_chunk(self, start_i, end_i):
                 self.data[col_loc] = col
 
         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+                                        bytes_encoding=self.bytes_encoding,
                                         float_format=self.float_format,
                                         decimal=self.decimal,
                                         date_format=self.date_format,
@@ -1582,6 +1594,14 @@
 
         lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
 
+    def _bytes_to_str(self, values):
+        """Modify values list by decoding bytes to str."""
+        if self.bytes_encoding:
+            for ii, value in enumerate(values):
+                if isinstance(value, bytes):
+                    values[ii] = value.decode(self.bytes_encoding)
+
+
 # from collections import namedtuple
 # ExcelCell = namedtuple("ExcelCell",
 #                        'row, col, val, style, mergestart, mergeend')
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index de7780d25b1e5..3468a44cd76db 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -1580,12 +1580,15 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
         result = _trim_front(format_array(values, None, justify='left'))
         return header + result
 
-    def to_native_types(self, slicer=None, **kwargs):
+    def to_native_types(self, slicer=None, bytes_encoding=None, **kwargs):
         """ slice and dice then format """
         values = self
         if slicer is not None:
             values = values[slicer]
-        return values._format_native_types(**kwargs)
+        result = values._format_native_types(**kwargs)
+        if bytes_encoding is not None and result.dtype == object:
+            lib.object_array_decode_bytes(result, bytes_encoding)
+        return result
 
     def _format_native_types(self, na_rep='', quoting=None, **kwargs):
         """ actually format my specific types """
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 0473ae79adce5..ea9ab2e741446 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -1053,6 +1053,25 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
     return arr
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def object_array_decode_bytes(ndarray[object, ndim=1] arr, object encoding):
+    """Decode any instances of bytes to str in arr using the given encoding."""
+    if bytes == str:  # in Python 2 these are the same and nothing needs to be done
+        return
+
+    cdef int length = arr.shape[0], i = 0
+    for i from 0 <= i < length:
+        if isinstance(arr[i], bytes):
+            arr[i] = arr[i].decode(encoding)
+        elif isinstance(arr[i], tuple):
+            mask = [isinstance(it, bytes) for it in arr[i]]
+            if any(mask):
+                val = [it.decode(encoding) if mask[j] else it
+                       for j, it in enumerate(arr[i])]
+                arr[i] = tuple(val)
+
+    return arr
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 43c8d6f25ab01..b11465dab9a93 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -790,6 +790,28 @@ def test_to_csv_unicode_index_col(self):
         df2 = read_csv(buf, index_col=0, encoding='UTF-8')
         assert_frame_equal(df, df2)
 
+    def test_to_csv_bytes(self):
+        # GH 9712
+        times = pd.date_range("2013-10-27 23:00", "2013-10-28 00:00", freq="H")
+        df = DataFrame.from_items([
+            (b'hello', ['a', b'b']),
+            (b'times', times),
+        ])
+        df.loc[2] = np.nan
+        df.index.name = 'idx'
+
+        with ensure_clean() as path:
+            df.to_csv(path)
+            with open(path) as csvfile:
+                lines = csvfile.readlines()
+
+        expected = [
+            "idx,hello,times\n",
+            "0,a,2013-10-27 23:00:00\n",
+            "1,b,2013-10-28 00:00:00\n", "2,,\n",
+        ]
+        assert(lines == expected)
+
     def test_to_csv_stringio(self):
         buf = StringIO()
         self.frame.to_csv(buf)
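
For reference, a minimal sketch (not part of the patch) of the behaviour this change targets, assuming Python 3 and a pandas build that includes it; the frame below mirrors the new test_to_csv_bytes test:

    import pandas as pd

    # bytes column label and a bytes value, as exercised by the new test
    df = pd.DataFrame({b'hello': ['a', b'b']})

    # Previously Python 3 wrote the repr of the bytes objects, e.g. "b'hello'"
    # and "b'b'". With bytes_encoding plumbed through to_native_types(),
    # to_csv() decodes bytes using the csv encoding (or the
    # "display.encoding" option), so the output is plain text:
    #
    #   ,hello
    #   0,a
    #   1,b
    print(df.to_csv())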