Skip to content

CLN: Stopped object inference in constructors for pandas objects #58758

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ Removal of prior version deprecations/changes
- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`)
- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
- Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
- Stopped performing dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object (:class:`Series`, :class:`Index`, :class:`ExtensionArray`); call ``.infer_objects`` on the input to keep the previous behavior (:issue:`56012`)
- Stopped performing dtype inference when setting an :class:`Index` into a :class:`DataFrame` (:issue:`56102`)
- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
- Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
Expand Down
13 changes: 3 additions & 10 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ContextManager,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -290,17 +289,11 @@ def box_expected(expected, box_cls, transpose: bool = True):
else:
expected = pd.array(expected, copy=False)
elif box_cls is Index:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Index(expected)
expected = Index(expected)
elif box_cls is Series:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected)
expected = Series(expected)
elif box_cls is DataFrame:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected).to_frame()
expected = Series(expected).to_frame()
if transpose:
# for vector operations, we need a DataFrame to be a single-row,
# not a single-column, in order to operate against non-DataFrame
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def sanitize_array(
# Avoid ending up with a NumpyExtensionArray
dtype = dtype.numpy_dtype

data_was_index = isinstance(data, ABCIndex)
infer_object = not isinstance(data, (ABCIndex, ABCSeries))

# extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
data = extract_array(data, extract_numpy=True, extract_range=True)
Expand Down Expand Up @@ -607,7 +607,7 @@ def sanitize_array(

if dtype is None:
subarr = data
if data.dtype == object and not data_was_index:
if data.dtype == object and infer_object:
subarr = maybe_infer_to_datetimelike(data)
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
from pandas.core.arrays.string_ import StringDtype
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,10 +728,6 @@ def __init__(
NDFrame.__init__(self, data)
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

# GH47215
if isinstance(index, set):
raise ValueError("index cannot be a set")
Expand Down Expand Up @@ -896,18 +892,6 @@ def __init__(

NDFrame.__init__(self, mgr)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtypes.iloc[0] != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The DataFrame "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)

# ----------------------------------------------------------------------

def __dataframe__(
Expand Down
22 changes: 5 additions & 17 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,6 @@ def __new__(
if not copy and isinstance(data, (ABCSeries, Index)):
refs = data._references

is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray))

# range
if isinstance(data, (range, RangeIndex)):
result = RangeIndex(start=data, copy=copy, name=name)
Expand All @@ -508,7 +506,7 @@ def __new__(
elif is_ea_or_datetimelike_dtype(data_dtype):
pass

elif isinstance(data, (np.ndarray, Index, ABCSeries)):
elif isinstance(data, (np.ndarray, ABCMultiIndex)):
if isinstance(data, ABCMultiIndex):
data = data._values

Expand All @@ -518,7 +516,9 @@ def __new__(
# they are actually ints, e.g. '0' and 0.0
# should not be coerced
data = com.asarray_tuplesafe(data, dtype=_dtype_obj)

elif isinstance(data, (ABCSeries, Index)):
# GH 56244: Avoid potential inference on object types
pass
elif is_scalar(data):
raise cls._raise_scalar_data_error(data)
elif hasattr(data, "__array__"):
Expand Down Expand Up @@ -571,19 +571,7 @@ def __new__(
klass = cls._dtype_to_subclass(arr.dtype)

arr = klass._ensure_array(arr, arr.dtype, copy=False)
result = klass._simple_new(arr, name, refs=refs)
if dtype is None and is_pandas_object and data_dtype == np.object_:
if result.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)
return result # type: ignore[return-value]
return klass._simple_new(arr, name, refs=refs)

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def ndarray_to_mgr(
) -> Manager:
# used in DataFrame.__init__
# input must be a ndarray, list, Series, Index, ExtensionArray
infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray))

if isinstance(values, ABCSeries):
if columns is None:
Expand Down Expand Up @@ -287,15 +288,14 @@ def ndarray_to_mgr(
# if we don't have a dtype specified, then try to convert objects
# on the entire block; this is to convert if we have datetimelikes
# embedded in an object type
if dtype is None and is_object_dtype(values.dtype):
if dtype is None and infer_object and is_object_dtype(values.dtype):
obj_columns = list(values)
maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
# don't convert (and copy) the objects if no type inference occurs
if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
block_values = [
new_block_2d(dvals_list[n], placement=BlockPlacement(n))
for n in range(len(dvals_list))
new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n))
for n, dval in enumerate(maybe_datetime)
]
else:
bp = BlockPlacement(slice(len(columns)))
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,6 @@ def __init__(
self.name = name
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False:
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
Expand Down Expand Up @@ -438,7 +434,6 @@ def __init__(
data = data.astype(dtype)

refs = data._references
data = data._values
copy = False

elif isinstance(data, np.ndarray):
Expand Down Expand Up @@ -512,17 +507,6 @@ def __init__(
self.name = name
self._set_axis(0, index)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Series "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old behavior.",
FutureWarning,
stacklevel=find_stack_level(),
)

def _init_dict(
self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None
):
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series):
assert df._mgr._has_no_reference(0)


def test_dataframe_from_series_infer_datetime():
def test_dataframe_from_series_dont_infer_datetime():
ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
df = DataFrame(ser)
assert not np.shares_memory(get_array(ser), get_array(df, 0))
assert df._mgr._has_no_reference(0)
df = DataFrame(ser)
assert df.dtypes.iloc[0] == np.dtype(object)
assert np.shares_memory(get_array(ser), get_array(df, 0))
assert not df._mgr._has_no_reference(0)


@pytest.mark.parametrize("index", [None, [0, 1, 2]])
Expand Down
17 changes: 5 additions & 12 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2693,21 +2693,14 @@ def test_frame_string_inference_block_dim(self):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
idx = Index([Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(idx, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
result = DataFrame({"a": idx})
obj = klass([Timestamp("2019-12-31")], dtype=object)
result = DataFrame(obj, columns=["a"])
assert result.dtypes.iloc[0] == np.object_

ser = Series([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(ser, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
result = DataFrame({"a": ser})
result = DataFrame({"a": obj})
assert result.dtypes.iloc[0] == np.object_

def test_dict_keys_returns_rangeindex(self):
Expand Down
16 changes: 5 additions & 11 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,12 @@ def test_index_string_inference(self):
ser = Index(["a", 1])
tm.assert_index_equal(ser, expected)

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
idx = Index([pd.Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(idx)
assert result.dtype != np.object_

ser = Series([pd.Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(ser)
assert result.dtype != np.object_
obj = klass([pd.Timestamp("2019-12-31")], dtype=object)
result = Index(obj)
assert result.dtype == np.object_

def test_constructor_not_read_only(self):
# GH#57130
Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,20 @@ def test_constructor_copy(self, using_infer_string):
)
def test_constructor_from_index_dtlike(self, cast_as_obj, index):
if cast_as_obj:
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(index.astype(object))
else:
result = Index(index)

tm.assert_index_equal(result, index)

if isinstance(index, DatetimeIndex):
assert result.tz == index.tz
if cast_as_obj:
result = Index(index.astype(object))
assert result.dtype == np.dtype(object)
if isinstance(index, DatetimeIndex):
# GH#23524 check that Index(dti, dtype=object) does not
# incorrectly raise ValueError, and that nanoseconds are not
# dropped
index += pd.Timedelta(nanoseconds=50)
result = Index(index, dtype=object)
assert result.dtype == np.object_
assert list(result) == list(index)
else:
result = Index(index)

tm.assert_index_equal(result, index)

@pytest.mark.parametrize(
"index,has_tz",
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/series/accessors/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,8 @@ def test_dt_accessor_limited_display_api(self):
tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods)))

# Period
idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
ser = Series(idx)
idx = period_range("20130101", periods=5, freq="D", name="xxx")
ser = Series(idx)
results = get_dir(ser)
tm.assert_almost_equal(
results, sorted(set(ok_for_period + ok_for_period_methods))
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/series/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,13 @@ def test_equals_matching_nas():
left = Series([np.datetime64("NaT")], dtype=object)
right = Series([np.datetime64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.timedelta64("NaT")], dtype=object)
right = Series([np.timedelta64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.float64("NaN")], dtype=object)
Expand Down
21 changes: 7 additions & 14 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self):
pi = period_range("20130101", periods=5, freq="D")
s = Series(pi)
assert s.dtype == "Period[D]"
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
expected = Series(pi.astype(object))
tm.assert_series_equal(s, expected)
expected = Series(pi.astype(object))
assert expected.dtype == object

def test_constructor_dict(self):
d = {"a": 0.0, "b": 1.0, "c": 2.0}
Expand Down Expand Up @@ -2141,20 +2140,14 @@ def test_series_string_inference_na_first(self):
result = Series([pd.NA, "b"])
tm.assert_series_equal(result, expected)

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
ser = Series([Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(None):
# This doesn't do inference
result = Series(ser)
obj = klass([Timestamp("2019-12-31")], dtype=object)
# This doesn't do inference
result = Series(obj)
assert result.dtype == np.object_

idx = Index([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Series(idx)
assert result.dtype != np.object_


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/tseries/frequencies/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
Expand Down Expand Up @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor):
assert frequencies.infer_freq(index) is None


@pytest.mark.parametrize(
"freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")]
)
def test_infer_freq_index(freq, expected):
rng = period_range("1959Q2", "2009Q3", freq=freq)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
rng = Index(rng.to_timestamp("D", how="e").astype(object))

assert rng.inferred_freq == expected


@pytest.mark.parametrize(
"expected,dates",
list(
Expand Down