From 9b199195443970532a8dea5e89af8e3409f6f0f8 Mon Sep 17 00:00:00 2001 From: sidhant007 Date: Mon, 25 May 2020 13:25:27 +0800 Subject: [PATCH 1/4] Fix writing bytes in csv --- Lib/test/test_csv.py | 7 +++++++ Modules/_csv.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index a16d14019f341f..b48925ec68e2a9 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -230,6 +230,13 @@ def test_writerows_with_none(self): fileobj.seek(0) self.assertEqual(fileobj.read(), 'a\r\n""\r\n') + def test_writerows_with_bytes(self): + with TemporaryFile("w+", newline='', encoding='iso-8859-1') as fileobj: + writer = csv.writer(fileobj) + writer.writerows([['a', b'\xc2'], [b'\xc2', 'c']]) + fileobj.seek(0); + self.assertEqual(fileobj.read(), 'a,\xc2\r\n\xc2,c\r\n') + @support.cpython_only def test_writerows_legacy_strings(self): import _testcapi diff --git a/Modules/_csv.c b/Modules/_csv.c index 3a52632ccfd456..e9d8d01a97117a 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -121,6 +121,8 @@ typedef struct { DialectObj *dialect; /* parsing dialect */ + PyObject *encoding; /* use this encoding when writing bytes */ + Py_UCS4 *rec; /* buffer for parser.join */ Py_ssize_t rec_size; /* size of allocated record */ Py_ssize_t rec_len; /* length of record */ @@ -1206,7 +1208,12 @@ csv_writerow(WriterObj *self, PyObject *seq) else { PyObject *str; - str = PyObject_Str(field); + if (PyBytes_Check(field)) { + const char * encoding = PyUnicode_AsUTF8(self->encoding); + str = PyUnicode_FromEncodedObject(field, encoding, NULL); + } else { + str = PyObject_Str(field); + } Py_DECREF(field); if (str == NULL) { Py_DECREF(iter); @@ -1305,6 +1312,7 @@ Writer_dealloc(WriterObj *self) Py_XDECREF(self->write); if (self->rec != NULL) PyMem_Free(self->rec); + Py_XDECREF(self->encoding); PyObject_GC_Del(self); } @@ -1313,6 +1321,7 @@ Writer_traverse(WriterObj *self, visitproc visit, void *arg) { 
Py_VISIT(self->dialect); Py_VISIT(self->write); + Py_VISIT(self->encoding); return 0; } @@ -1321,6 +1330,7 @@ Writer_clear(WriterObj *self) { Py_CLEAR(self->dialect); Py_CLEAR(self->write); + Py_CLEAR(self->encoding); return 0; } @@ -1372,12 +1382,15 @@ csv_writer(PyObject *module, PyObject *args, PyObject *keyword_args) PyObject * output_file, * dialect = NULL; WriterObj * self = PyObject_GC_New(WriterObj, &Writer_Type); _Py_IDENTIFIER(write); + _Py_IDENTIFIER(encoding); + _Py_IDENTIFIER(getpreferredencoding); if (!self) return NULL; self->dialect = NULL; self->write = NULL; + self->encoding = NULL; self->rec = NULL; self->rec_size = 0; @@ -1398,6 +1411,27 @@ csv_writer(PyObject *module, PyObject *args, PyObject *keyword_args) Py_DECREF(self); return NULL; } + + int r = _PyObject_LookupAttrId(output_file, &PyId_encoding, &self->encoding); + if (r < 0) { + Py_DECREF(self); + return NULL; + } + else if (r == 0) { + PyObject* locale_module = PyImport_ImportModule("locale"); + if (locale_module == NULL) { + Py_DECREF(self); + return NULL; + } + self->encoding = _PyObject_CallMethodIdOneArg( + locale_module, &PyId_getpreferredencoding, Py_False); + Py_DECREF(locale_module); + } + if (self->encoding == NULL) { + Py_DECREF(self); + return NULL; + } + self->dialect = (DialectObj *)_call_dialect(dialect, keyword_args); if (self->dialect == NULL) { Py_DECREF(self); From 41202a20242b41cf77bd15cf93d4d4fcf7e6dcfc Mon Sep 17 00:00:00 2001 From: sidhant007 Date: Mon, 25 May 2020 15:22:41 +0800 Subject: [PATCH 2/4] Add more tests --- Lib/test/test_csv.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index b48925ec68e2a9..ca17d799c85bd6 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -231,12 +231,17 @@ def test_writerows_with_none(self): self.assertEqual(fileobj.read(), 'a\r\n""\r\n') def test_writerows_with_bytes(self): - with TemporaryFile("w+", newline='', encoding='iso-8859-1') as fileobj: + 
with TemporaryFile("w+", newline='', encoding='latin-1') as fileobj: writer = csv.writer(fileobj) writer.writerows([['a', b'\xc2'], [b'\xc2', 'c']]) fileobj.seek(0); self.assertEqual(fileobj.read(), 'a,\xc2\r\n\xc2,c\r\n') + with TemporaryFile("w+", newline='', encoding='utf-8') as fileobj: + writer = csv.writer(fileobj) + self.assertRaises(UnicodeDecodeError, writer.writerows, [['a', b'\xc2'], ['a', 'c']]) + + @support.cpython_only def test_writerows_legacy_strings(self): import _testcapi From ef69605512975a3525577de2d50a45a28ee5705b Mon Sep 17 00:00:00 2001 From: sidhant007 Date: Mon, 25 May 2020 16:20:12 +0800 Subject: [PATCH 3/4] Update csv docs --- Doc/library/csv.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 61d39828e0194a..be100ac5c5c99f 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -434,9 +434,10 @@ Writer Objects :class:`Writer` objects (:class:`DictWriter` instances and objects returned by the :func:`writer` function) have the following public methods. A *row* must be -an iterable of strings or numbers for :class:`Writer` objects and a dictionary -mapping fieldnames to strings or numbers (by passing them through :func:`str` -first) for :class:`DictWriter` objects. Note that complex numbers are written +an iterable of strings, numbers or bytes for :class:`Writer` objects and a dictionary +mapping fieldnames to strings, numbers or bytes (by passing them through :func:`str` +first) for :class:`DictWriter` objects. Note that bytes will be written according to +the encoding scheme of the output file. Also note that complex numbers are written out surrounded by parens. This may cause some problems for other programs which read CSV files (assuming they support complex numbers at all). 
From a3de9552f110677aaa83930ba3decc7461fa168b Mon Sep 17 00:00:00 2001 From: sidhant007 Date: Mon, 25 May 2020 17:06:25 +0800 Subject: [PATCH 4/4] Add NEWS.d --- .../Core and Builtins/2020-05-25-07-25-29.bpo-40762.TkMFHk.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-25-07-25-29.bpo-40762.TkMFHk.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-25-07-25-29.bpo-40762.TkMFHk.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-25-07-25-29.bpo-40762.TkMFHk.rst new file mode 100644 index 00000000000000..1a45a5fcaabaae --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-25-07-25-29.bpo-40762.TkMFHk.rst @@ -0,0 +1 @@ +csv.Writer.writerow() now supports writing bytes as they are instead of writing them as b-prefixed strings. It uses the encoding provided by the file object to write the bytes in text mode. In case the file object has no encoding attribute, it falls back on ``locale.getpreferredencoding`` to decide. \ No newline at end of file