From 9a4a6c4d1e7b59a44d6232f122d7dfb5baca76bb Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Wed, 30 Sep 2015 09:55:43 -0700 Subject: [PATCH 1/2] Basic working implementation --- pandas/core/frame.py | 9 --- pandas/core/generic.py | 123 +++++++---------------------------- pandas/core/indexing.py | 6 +- pandas/core/series.py | 3 - pandas/tests/test_generic.py | 79 ++++++++++++++++++++++ 5 files changed, 107 insertions(+), 113 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9e1eda4714734..ae4b7127e8f53 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2244,7 +2244,6 @@ def __setitem__(self, key, value): self._set_item(key, value) def _setitem_slice(self, key, value): - self._check_setitem_copy() self.ix._setitem_with_indexer(key, value) def _setitem_array(self, key, value): @@ -2255,7 +2254,6 @@ def _setitem_array(self, key, value): (len(key), len(self.index))) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - self._check_setitem_copy() self.ix._setitem_with_indexer(indexer, value) else: if isinstance(value, DataFrame): @@ -2265,7 +2263,6 @@ def _setitem_array(self, key, value): self[k1] = value[k2] else: indexer = self.ix._convert_to_indexer(key, axis=1) - self._check_setitem_copy() self.ix._setitem_with_indexer((slice(None), indexer), value) def _setitem_frame(self, key, value): @@ -2275,7 +2272,6 @@ def _setitem_frame(self, key, value): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) - self._check_setitem_copy() self.where(-key, value, inplace=True) def _ensure_valid_index(self, value): @@ -2311,11 +2307,6 @@ def _set_item(self, key, value): value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) - # check if we are modifying a copy - # try to set first as we want an invalid - # value exeption to occur first - if len(self): - self._check_setitem_copy() def insert(self, loc, column, value, allow_duplicates=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6aec297c31d2b..75eaccf562bb9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -83,7 +83,7 @@ class NDFrame(PandasObject): _internal_names = ['_data', '_cacher', '_item_cache', '_cache', 'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value', '_metadata', - '__array_struct__', '__array_interface__'] + '__array_struct__', '__array_interface__', '_children'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] @@ -105,6 +105,8 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, 'is_copy', None) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) + object.__setattr__(self, '_children', []) + def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -1174,9 +1176,6 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): except: pass - if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t='referant') - if clear: self._clear_item_cache() @@ -1201,9 +1200,16 @@ def _slice(self, slobj, axis=0, kind=None): # but only in a single-dtyped view slicable case is_copy = axis!=0 or result._is_view result._set_is_copy(self, copy=is_copy) + + self._children.append(weakref.ref(result)) + return result def _set_item(self, key, value): + + # If children are views, reset to copies before setting. + self._convert_views_to_copies() + self._data.set(key, value) self._clear_item_cache() @@ -1216,103 +1222,21 @@ def _set_is_copy(self, ref=None, copy=True): else: self.is_copy = None - def _check_is_chained_assignment_possible(self): - """ - check if we are a view, have a cacher, and are of mixed type - if so, then force a setitem_copy check + def _convert_views_to_copies(self): + # Don't set on views. + if self._is_view: + self._data = self._data.copy() - should be called just near setting a value + # Before setting values, make sure children converted to copies. + for child in self._children: - will return a boolean if it we are a view and are cached, but a single-dtype - meaning that the cacher should be updated following setting - """ - if self._is_view and self._is_cached: - ref = self._get_cacher() - if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t='referant', force=True) - return True - elif self.is_copy: - self._check_setitem_copy(stacklevel=4, t='referant') - return False - - def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): - """ - - Parameters - ---------- - stacklevel : integer, default 4 - the level to show of the stack when the error is output - t : string, the type of setting error - force : boolean, default False - if True, then force showing an error - - validate if we are doing a settitem on a chained copy. - - If you call this function, be sure to set the stacklevel such that the - user will see the error *at the level of setting* - - It is technically possible to figure out that we are setting on - a copy even WITH a multi-dtyped pandas object. In other words, some blocks - may be views while other are not. Currently _is_view will ALWAYS return False - for multi-blocks to avoid having to handle this case. - - df = DataFrame(np.arange(0,9), columns=['count']) - df['group'] = 'b' - - # this technically need not raise SettingWithCopy if both are view (which is not - # generally guaranteed but is usually True - # however, this is in general not a good practice and we recommend using .loc - df.iloc[0:5]['group'] = 'a' - - """ - - if force or self.is_copy: - - value = config.get_option('mode.chained_assignment') - if value is None: - return - - # see if the copy is not actually refererd; if so, then disolve - # the copy weakref - try: - gc.collect(2) - if not gc.get_referents(self.is_copy()): - self.is_copy = None - return - except: - pass - - # we might be a false positive - try: - if self.is_copy().shape == self.shape: - self.is_copy = None - return - except: - pass - - # a custom message - if isinstance(self.is_copy, string_types): - t = self.is_copy - - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value instead\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - if value == 'raise': - raise SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel) + # Make sure children of children converted. + child()._convert_views_to_copies() + + if child()._is_view: + child()._data = child()._data.copy() + + self._children=[] def __delitem__(self, key): """ @@ -2252,6 +2176,7 @@ def __setattr__(self, name, value): # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify # the same attribute. + try: object.__getattribute__(self, name) return object.__setattr__(self, name, value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8b4528ef451ef..c27b3a0dc328a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -111,6 +111,9 @@ def _get_setitem_indexer(self, key): raise IndexingError(key) def __setitem__(self, key, value): + # Make sure changes don't propagate to children + self.obj._convert_views_to_copies() + indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) @@ -199,6 +202,7 @@ def _has_valid_positional_setitem_indexer(self, indexer): def _setitem_with_indexer(self, indexer, value): self._has_valid_setitem_indexer(indexer) + # also has the side effect of consolidating in-place from pandas import Panel, DataFrame, Series info_axis = self.obj._info_axis_number @@ -508,8 +512,6 @@ def can_do_equal_len(): if isinstance(value, ABCPanel): value = self._align_panel(indexer, value) - # check for chained assignment - self.obj._check_is_chained_assignment_possible() # actually do the set self.obj._consolidate_inplace() diff --git a/pandas/core/series.py b/pandas/core/series.py index 11645311467d5..8c54065a6c428 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -714,10 +714,7 @@ def setitem(key, value): self._set_with(key, value) # do the setitem - cacher_needs_updating = self._check_is_chained_assignment_possible() setitem(key, value) - if cacher_needs_updating: - self._maybe_update_cacher() def _set_with_engine(self, key, value): values = self._values diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3a26be2ca1032..0ff77d2110210 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1681,6 +1681,85 @@ def test_set_attribute(self): assert_equal(df.y, 5) assert_series_equal(df['y'], Series([2, 4, 6], name='y')) + def test_copy_on_write(self): + + ####### + # FORWARD PROPAGATION TESTS + ####### + + ## + # Test children recorded from various slicing methods + ## + + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + self.assertTrue(len(df._children)==0) + + + views = dict() + + views['loc'] = df.loc[0:0,] + views['iloc'] = df.iloc[0:1,] + views['ix'] = df.ix[0:0,] + views['loc_of_loc'] = views['loc'].loc[0:0,] + + copies = dict() + for v in views.keys(): + self.assertTrue(views[v]._is_view) + copies[v] = views[v].copy() + + + + df.loc[0,'col1'] = -88 + + for v in views.keys(): + tm.assert_frame_equal(views[v], copies[v]) + self.assertFalse(views[v]._is_view) + + ## + # Test views become copies + # during different forms of value setting. + ## + + parent = dict() + views = dict() + copies = dict() + for v in ['loc', 'iloc', 'ix', 'column', 'attribute']: + parent[v] = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + views[v] = parent[v].loc[0:0,] + copies[v] = views[v].copy() + self.assertTrue( views[v]._is_view ) + + parent['loc'].loc[0, 'col1'] = -88 + parent['iloc'].iloc[0, 0] = -88 + parent['ix'].ix[0, 'col1'] = -88 + parent['column']['col1'] = -88 + parent['attribute'].col1 = -88 + + + for v in views.keys(): + tm.assert_frame_equal(views[v], copies[v]) + self.assertFalse(views[v]._is_view) + + + + ######## + # No Backward Propogation + ####### + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + df_copy = df.copy() + + views = dict() + + views['loc'] = df.loc[0:0,] + views['iloc'] = df.iloc[0:1,] + views['ix'] = df.ix[0:0,] + views['loc_of_loc'] = views['loc'].loc[0:0,] + + for v in views.keys(): + views[v].loc[0:0,] = -99 + + tm.assert_frame_equal(df, df_copy) + class TestPanel(tm.TestCase, Generic): _typ = Panel From f77cb62ddea47d49fd9e47daa79cdb311e365b79 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Thu, 29 Oct 2015 19:41:37 -0700 Subject: [PATCH 2/2] now keeps columns as views --- pandas/core/frame.py | 4 +++- pandas/core/generic.py | 17 +++++++++++++---- pandas/core/indexing.py | 1 + pandas/tests/test_generic.py | 37 +++++++++++++++++++++++++++--------- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae4b7127e8f53..ba6d17997f0b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1894,7 +1894,9 @@ def __getitem__(self, key): is_mi_columns = isinstance(self.columns, MultiIndex) try: if key in self.columns and not is_mi_columns: - return self._getitem_column(key) + result = self._getitem_column(key) + result._is_column_view = True + return result except: pass diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 75eaccf562bb9..746648772b01b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -83,7 +83,8 @@ class NDFrame(PandasObject): _internal_names = ['_data', '_cacher', '_item_cache', '_cache', 'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value', '_metadata', - '__array_struct__', '__array_interface__', '_children'] + '__array_struct__', '__array_interface__', '_children', + '_is_column_view'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] @@ -106,6 +107,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) object.__setattr__(self, '_children', []) + object.__setattr__(self, '_is_column_view', False) def _validate_dtype(self, dtype): @@ -1076,13 +1078,19 @@ def get(self, key, default=None): ------- value : type of items contained in object """ + try: return self[key] except (KeyError, ValueError, IndexError): return default def __getitem__(self, item): - return self._get_item_cache(item) + result = self._get_item_cache(item) + + if isinstance(item, str): + result._is_column_view = True + + return result def _get_item_cache(self, item): """ return the cached item, item represents a label indexer """ @@ -1224,7 +1232,7 @@ def _set_is_copy(self, ref=None, copy=True): def _convert_views_to_copies(self): # Don't set on views. - if self._is_view: + if self._is_view and not self._is_column_view: self._data = self._data.copy() # Before setting values, make sure children converted to copies. @@ -1233,7 +1241,7 @@ def _convert_views_to_copies(self): # Make sure children of children converted. child()._convert_views_to_copies() - if child()._is_view: + if child()._is_view and not self._is_column_view: child()._data = child()._data.copy() self._children=[] @@ -2153,6 +2161,7 @@ def __finalize__(self, other, method=None, **kwargs): return self def __getattr__(self, name): + """After regular attribute access, try looking up the name This allows simpler access to columns for interactive use. """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c27b3a0dc328a..ab5ba8d00a7f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ def __iter__(self): raise NotImplementedError('ix is not iterable') def __getitem__(self, key): + if type(key) is tuple: try: values = self.obj.get_value(*key) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 0ff77d2110210..ef9ea44f92cd2 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1687,9 +1687,7 @@ def test_copy_on_write(self): # FORWARD PROPAGATION TESTS ####### - ## - # Test children recorded from various slicing methods - ## + # Test various slicing methods add to _children df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) self.assertTrue(len(df._children)==0) @@ -1715,10 +1713,8 @@ def test_copy_on_write(self): tm.assert_frame_equal(views[v], copies[v]) self.assertFalse(views[v]._is_view) - ## - # Test views become copies - # during different forms of value setting. - ## + # Test different forms of value setting + # all trigger conversions parent = dict() views = dict() @@ -1739,8 +1735,6 @@ def test_copy_on_write(self): for v in views.keys(): tm.assert_frame_equal(views[v], copies[v]) self.assertFalse(views[v]._is_view) - - ######## # No Backward Propogation @@ -1760,6 +1754,31 @@ def test_copy_on_write(self): tm.assert_frame_equal(df, df_copy) + ### + # Dictionary-like access to single columns SHOULD give views + ### + + # If change child, should back-propagate + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + v = df['col1'] + self.assertTrue(v._is_view) + self.assertTrue(v._is_column_view) + v.loc[0]=-88 + self.assertTrue(df.loc[0,'col1'] == -88) + self.assertTrue(v._is_view) + + # If change parent, should forward-propagate + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + v = df['col1'] + self.assertTrue(v._is_view) + self.assertTrue(v._is_column_view) + df.loc[0, 'col1']=-88 + self.assertTrue(v.loc[0] == -88) + self.assertTrue(v._is_view) + + + + class TestPanel(tm.TestCase, Generic): _typ = Panel