diff --git a/RELEASE.rst b/RELEASE.rst index f3fb98535cb61..ca9c25294dc23 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -44,6 +44,11 @@ pandas 0.11.1 **KeyError** if **key** is not a valid store object. - The repr() for (Multi)Index now obeys display.max_seq_items rather then numpy threshold print options. (GH3426_, GH3466_) + - Added mangle_dupe_cols option to read_table/csv, allowing users + to control the legacy handling of duplicate column names (A, A.1, A.2 vs A, A, A) (GH3468_) + Note: The default value will change in 0.12 to the "no mangle" behaviour. + If your code relies on the current mangling behaviour, explicitly specify mangle_dupe_cols=True + in your calls. **Bug Fixes** @@ -72,6 +77,7 @@ pandas 0.11.1 .. _GH3466: https://github.com/pydata/pandas/issues/3466 .. _GH3038: https://github.com/pydata/pandas/issues/3038 .. _GH3437: https://github.com/pydata/pandas/issues/3437 +.. _GH3468: https://github.com/pydata/pandas/issues/3468 .. _GH3455: https://github.com/pydata/pandas/issues/3455 .. _GH3457: https://github.com/pydata/pandas/issues/3457 .. _GH3461: https://github.com/pydata/pandas/issues/3461 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 89f892daf9389..e2bbd456ea113 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -254,7 +254,8 @@ def _read(filepath_or_buffer, kwds): 'verbose': False, 'encoding': None, 'squeeze': False, - 'compression': None + 'compression': None, + 'mangle_dupe_cols': True, } @@ -340,7 +341,9 @@ def parser_f(filepath_or_buffer, verbose=False, encoding=None, - squeeze=False): + squeeze=False, + mangle_dupe_cols=True + ): # Alias sep -> delimiter. 
if delimiter is None: @@ -396,7 +399,9 @@ def parser_f(filepath_or_buffer, warn_bad_lines=warn_bad_lines, error_bad_lines=error_bad_lines, low_memory=low_memory, - buffer_lines=buffer_lines) + buffer_lines=buffer_lines, + mangle_dupe_cols=mangle_dupe_cols + ) return _read(filepath_or_buffer, kwds) @@ -1142,6 +1147,7 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] + self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True) self.has_index_names = False if 'has_index_names' in kwds: @@ -1323,12 +1329,13 @@ def _infer_columns(self): else: columns.append(c) - counts = {} - for i, col in enumerate(columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - columns[i] = '%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 + if self.mangle_dupe_cols: + counts = {} + for i, col in enumerate(columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 self._clear_buffer() diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index aa3fce3959860..5ff832431c917 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -589,14 +589,21 @@ def test_string_nas(self): tm.assert_frame_equal(result, expected) def test_duplicate_columns(self): - data = """A,A,B,B,B -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df = self.read_table(StringIO(data), sep=',') - self.assert_(np.array_equal(df.columns, - ['A', 'A.1', 'B', 'B.1', 'B.2'])) + for engine in ['python', 'c']: + data = """A,A,B,B,B + 1,2,3,4,5 + 6,7,8,9,10 + 11,12,13,14,15 + """ + # check default behaviour + df = self.read_table(StringIO(data), sep=',',engine=engine) + self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) + + df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False) + self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B']) + + 
df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True) + self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) def test_csv_mixed_type(self): data = """A,B,C diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index 95c57f1675c64..694a769641b0d 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -249,6 +249,7 @@ cdef class TextReader: object dtype object encoding object compression + object mangle_dupe_cols set noconvert, usecols def __cinit__(self, source, @@ -298,11 +299,14 @@ cdef class TextReader: buffer_lines=None, skiprows=None, skip_footer=0, - verbose=False): + verbose=False, + mangle_dupe_cols=True): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize + self.mangle_dupe_cols=mangle_dupe_cols + # For timekeeping self.clocks = [] @@ -571,8 +575,9 @@ cdef class TextReader: if name == '': name = 'Unnamed: %d' % i + count = counts.get(name, 0) - if count > 0: + if count > 0 and self.mangle_dupe_cols: header.append('%s.%d' % (name, count)) else: header.append(name)