Skip to content

REF: share string parsing code #50736

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 30 additions & 56 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ from pandas._libs.tslibs.parsing import parse_datetime_string
from pandas._libs.tslibs.conversion cimport (
_TSObject,
cast_from_unit,
convert_datetime_to_tsobject,
convert_str_to_tsobject,
convert_timezone,
get_datetime64_nanos,
parse_pydatetime,
Expand Down Expand Up @@ -482,20 +482,15 @@ cpdef array_to_datetime(
object val, tz
ndarray[int64_t] iresult
npy_datetimestruct dts
NPY_DATETIMEUNIT out_bestunit
bint utc_convert = bool(utc)
bint seen_datetime_offset = False
bint is_raise = errors=="raise"
bint is_ignore = errors=="ignore"
bint is_coerce = errors=="coerce"
bint is_same_offsets
_TSObject _ts
int64_t value
int out_local = 0, out_tzoffset = 0
float tz_offset
set out_tzoffset_vals = set()
bint string_to_dts_failed
datetime py_dt
tzinfo tz_out = None
bint found_tz = False, found_naive = False
cnp.broadcast mi
Expand Down Expand Up @@ -557,61 +552,40 @@ cpdef array_to_datetime(
# GH#32264 np.str_ object
val = str(val)

if len(val) == 0 or val in nat_strings:
iresult[i] = NPY_NAT
if parse_today_now(val, &iresult[i], utc):
# We can't _quite_ dispatch this to convert_str_to_tsobject
# because there isn't a nice way to pass "utc"
cnp.PyArray_MultiIter_NEXT(mi)
continue

string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, None, False
_ts = convert_str_to_tsobject(
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
)
if string_to_dts_failed:
# An error at this point is a _parsing_ error
# specifically _not_ OutOfBoundsDatetime
if parse_today_now(val, &iresult[i], utc):
cnp.PyArray_MultiIter_NEXT(mi)
continue

py_dt = parse_datetime_string(val,
dayfirst=dayfirst,
yearfirst=yearfirst)
# If the dateutil parser returned tzinfo, capture it
# to check if all arguments have the same tzinfo
tz = py_dt.utcoffset()

if tz is not None:
seen_datetime_offset = True
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
out_tzoffset_vals.add(tz.total_seconds())
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")

_ts = convert_datetime_to_tsobject(py_dt, None)
iresult[i] = _ts.value
try:
_ts.ensure_reso(NPY_FR_ns)
except OutOfBoundsDatetime as err:
# re-raise with better exception message
raise OutOfBoundsDatetime(
f"Out of bounds nanosecond timestamp: {val}"
) from err

iresult[i] = _ts.value

tz = _ts.tzinfo
if tz is not None:
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
nsecs = tz.utcoffset(None).total_seconds()
out_tzoffset_vals.add(nsecs)
# need to set seen_datetime_offset *after* the
# potentially-raising timezone(timedelta(...)) call,
# otherwise we can go down the is_same_offsets path
# because len(out_tzoffset_vals) == 0
seen_datetime_offset = True
else:
# No error reported by string_to_dts, pick back up
# where we left off
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
if out_local == 1:
seen_datetime_offset = True
# Store the out_tzoffset in seconds
# since we store the total_seconds of
# dateutil.tz.tzoffset objects
out_tzoffset_vals.add(out_tzoffset * 60.)
tz = timezone(timedelta(minutes=out_tzoffset))
value = tz_localize_to_utc_single(value, tz)
out_local = 0
out_tzoffset = 0
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")
iresult[i] = value
check_dts_bounds(&dts)
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")

else:
raise TypeError(f"{type(val)} is not convertible to datetime")
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/tslibs/conversion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz,
int32_t nanos=*,
NPY_DATETIMEUNIT reso=*)

cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
bint dayfirst=*,
bint yearfirst=*)

cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1

cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
obj = _TSObject()

if isinstance(ts, str):
return _convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst)

if ts is None or ts is NaT:
obj.value = NPY_NAT
Expand Down Expand Up @@ -463,9 +463,9 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts,
return obj


cdef _TSObject _convert_str_to_tsobject(str ts, tzinfo tz, str unit,
bint dayfirst=False,
bint yearfirst=False):
cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
bint dayfirst=False,
bint yearfirst=False):
"""
Convert a string input `ts`, along with optional timezone object `tz`
to a _TSObject.
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexes/datetimes/test_scalar_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def test_dti_date(self):
@pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]])
def test_dti_date_out_of_range(self, data):
# GH#1475
msg = "^Out of bounds nanosecond timestamp: 1400-01-01 00:00:00, at position 0$"
msg = (
"^Out of bounds nanosecond timestamp: "
"1400-01-01( 00:00:00)?, at position 0$"
)
with pytest.raises(OutOfBoundsDatetime, match=msg):
DatetimeIndex(data)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2783,7 +2783,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format, warning):
assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache))

def test_day_not_in_month_raise(self, cache):
msg = "day is out of range for month"
msg = "could not convert string to Timestamp"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
UserWarning, match="Could not infer format"
Expand Down