From 899c14763fed94ba4e34064b978ba773155d7752 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Mar 2013 20:34:07 -0400 Subject: [PATCH] BUG: Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_) --- RELEASE.rst | 2 ++ pandas/core/frame.py | 17 +++++++++++++---- pandas/tests/test_frame.py | 24 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8894df02ed989..baad73652b577 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -146,6 +146,7 @@ pandas 0.11.0 - Bug in DataFrame column insertion when the column creation fails, existing frame is left in an irrecoverable state (GH3010_) - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_) + - Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_) - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from other values), (GH2850_) @@ -178,6 +179,7 @@ pandas 0.11.0 .. _GH3010: https://github.com/pydata/pandas/issues/3010 .. _GH3012: https://github.com/pydata/pandas/issues/3012 .. _GH3029: https://github.com/pydata/pandas/issues/3029 +.. _GH3041: https://github.com/pydata/pandas/issues/3041 pandas 0.10.1 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c108cc677b9c6..8e7dbed742c3f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3723,7 +3723,7 @@ def _compare(a, b): return self._constructor(data=new_data, index=self.index, columns=self.columns, copy=False) - def combine(self, other, func, fill_value=None): + def combine(self, other, func, fill_value=None, overwrite=True): """ Add two DataFrame objects and do not propagate NaN values, so if for a (column, time) one frame is missing a value, it will default to the @@ -3734,6 +3734,8 @@ def combine(self, other, func, fill_value=None): other : DataFrame func : function fill_value : scalar value + overwrite : boolean, default True + If True then overwrite values for common keys in the calling frame Returns ------- @@ -3760,9 +3762,16 @@ def combine(self, other, func, fill_value=None): series = this[col].values otherSeries = other[col].values + this_mask = isnull(series) + other_mask = isnull(otherSeries) + + # don't overwrite columns unecessarily + # DO propogate if this column is not in the intersection + if not overwrite and other_mask.all(): + result[col] = this[col].copy() + continue + if do_fill: - this_mask = isnull(series) - other_mask = isnull(otherSeries) series = series.copy() otherSeries = otherSeries.copy() series[this_mask] = fill_value @@ -3798,7 +3807,7 @@ def combine_first(self, other): combined : DataFrame """ combiner = lambda x, y: np.where(isnull(x), y, x) - return self.combine(other, combiner) + return self.combine(other, combiner, overwrite=False) def update(self, other, join='left', overwrite=True, filter_func=None, raise_conflict=False): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cba6adadb8d6c..d83bed55f2418 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7248,6 +7248,30 @@ def test_combine_first_mixed_bug(self): combined = frame1.combine_first(frame2) self.assertEqual(len(combined.columns), 5) + # gh 3016 (same as in update) + df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]], + columns=['A','B','bool1','bool2']) + + other = DataFrame([[45,45]],index=[0],columns=['A','B']) + result = df.combine_first(other) + assert_frame_equal(result, df) + + df.ix[0,'A'] = np.nan + result = df.combine_first(other) + df.ix[0,'A'] = 45 + assert_frame_equal(result, df) + + # doc example + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + + result = df1.combine_first(df2) + expected = DataFrame({ 'A' : [1,2,3,5,3,7.], 'B' : [np.nan,2,3,4,6,8] }) + assert_frame_equal(result,expected) + def test_update(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.],