From 82317e091fb0dee7c70653ec3f2313d58bfc312c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Mar 2022 16:51:22 -0800 Subject: [PATCH 01/16] REF: implement Localizer --- pandas/_libs/tslibs/timezones.pxd | 20 ++++ pandas/_libs/tslibs/timezones.pyx | 37 ++++++++ pandas/_libs/tslibs/tzconversion.pxd | 10 +- pandas/_libs/tslibs/tzconversion.pyx | 18 ++++ pandas/_libs/tslibs/vectorized.pyx | 133 ++++++--------------------- 5 files changed, 112 insertions(+), 106 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 13f196a567952..1bc12c7bd8619 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -3,6 +3,10 @@ from cpython.datetime cimport ( timedelta, tzinfo, ) +from numpy cimport ( + int64_t, + ndarray, +) cdef tzinfo utc_pytz @@ -20,3 +24,19 @@ cdef timedelta get_utcoffset(tzinfo tz, datetime obj) cdef bint is_fixed_offset(tzinfo tz) cdef object get_dst_info(tzinfo tz) + + +cdef class Localizer: + cdef: + tzinfo tz + bint use_utc + bint use_fixed + bint use_tzlocal + bint use_dst + bint use_pytz + ndarray trans + int64_t[:] deltas + int64_t delta + str typ + + cdef ndarray[int64_t] prepare(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 224c5be1f3b7d..7feeb7d045b78 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -406,3 +406,40 @@ def tz_standardize(tz: tzinfo) -> tzinfo: if treat_tz_as_pytz(tz): return pytz.timezone(str(tz)) return tz + + +cdef class Localizer: + # cdef: + # tzinfo tz + # bint use_utc + # bint use_fixed + # bint use_tzlocal + # bint use_pytz + # bint use_dst + # ndarray trans + # int64_t[:] deltas + # int64_t delta + # str typ + + def __cinit__(self, tzinfo tz): + self.tz = tz + if is_utc(tz) or tz is None: + self.use_utc = True + elif is_tzlocal(tz): + self.use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + self.trans = trans + self.deltas = deltas + self.typ = typ + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + self.use_fixed = True + self.delta = deltas[0] + else: + self.use_dst = True + + cdef ndarray[int64_t] prepare(self, const int64_t[:] stamps): + if self.use_dst: + return self.trans.searchsorted(stamps, side="right") - 1 diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 3666d00707ac8..d7d78d6f2f1a9 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -1,5 +1,10 @@ from cpython.datetime cimport tzinfo -from numpy cimport int64_t +from numpy cimport ( + int64_t, + intp_t, +) + +from pandas._libs.tslibs.timezones cimport Localizer cdef int64_t tz_convert_utc_to_tzlocal( @@ -9,3 +14,6 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz) cdef int64_t tz_localize_to_utc_single( int64_t val, tzinfo tz, object ambiguous=*, object nonexistent=* ) except? -1 + + +cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t[:] pos, Py_ssize_t i) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 4dbfabad5dc84..c24fdf5714d68 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, ) from pandas._libs.tslibs.timezones cimport ( + Localizer, get_dst_info, get_utcoffset, is_fixed_offset, @@ -566,3 +567,20 @@ cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, return val - delta else: return val + delta + + +# TODO: make this a Localizer method? would require moving tzlocal func +cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t[:] pos, Py_ssize_t i): + cdef: + int64_t local_val + + if info.use_utc: + local_val = utc_val + elif info.use_tzlocal: + local_val = _tz_convert_tzlocal_utc(utc_val, info.tz, to_utc=False) + elif info.use_fixed: + local_val = utc_val + info.delta + else: + local_val = utc_val + info.deltas[pos[i]] + + return local_val diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 17720de33ab33..456e08d437d09 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -31,11 +31,15 @@ from .offsets cimport BaseOffset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts from .timezones cimport ( + Localizer, get_dst_info, is_tzlocal, is_utc, ) -from .tzconversion cimport tz_convert_utc_to_tzlocal +from .tzconversion cimport ( + tz_convert_utc_to_tzlocal, + utc_val_to_local_val, +) # ------------------------------------------------------------------------- @@ -131,6 +135,7 @@ def ints_to_pydatetime( object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) bint use_utc = False, use_tzlocal = False, use_fixed = False bint use_pytz = False + Localizer info = Localizer(tz) if box == "date": assert (tz is None), "tz should be None when converting to date" @@ -168,11 +173,11 @@ def ints_to_pydatetime( if value == NPY_NAT: result[i] = NaT else: - if use_utc: + if info.use_utc: local_val = value - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(value, tz) - elif use_fixed: + elif info.use_fixed: local_val = value + delta elif not use_pytz: # i.e. dateutil @@ -225,37 +230,17 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso - ndarray[int64_t] trans - int64_t[:] deltas intp_t[:] pos - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False + int64_t local_val + Localizer info = Localizer(tz) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + pos = info.prepare(stamps) for i in range(n): if stamps[i] == NPY_NAT: continue - if use_utc: - local_val = stamps[i] - elif use_tzlocal: - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta - else: - local_val = stamps[i] + deltas[pos[i]] + local_val = utc_val_to_local_val(info, stamps[i], pos, i) dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) @@ -287,39 +272,18 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - str typ - Py_ssize_t[:] pos - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False + intp_t[:] pos + int64_t local_val + Localizer info = Localizer(tz) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + pos = info.prepare(stamps) for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - if use_utc: - local_val = stamps[i] - elif use_tzlocal: - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta - else: - local_val = stamps[i] + deltas[pos[i]] + local_val = utc_val_to_local_val(info, stamps[i], pos, i) result[i] = normalize_i8_stamp(local_val) @@ -345,36 +309,15 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: """ cdef: Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - int64_t local_val, delta = NPY_NAT - str typ + int64_t local_val int64_t day_nanos = 24 * 3600 * 1_000_000_000 - bint use_utc = False, use_tzlocal = False, use_fixed = False + intp_t[:] pos + Localizer info = Localizer(tz) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + pos = info.prepare(stamps) for i in range(n): - if use_utc: - local_val = stamps[i] - elif use_tzlocal: - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta - else: - local_val = stamps[i] + deltas[pos[i]] + local_val = utc_val_to_local_val(info, stamps[i], pos, i) if local_val % day_nanos != 0: return False @@ -391,39 +334,19 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - Py_ssize_t[:] pos + intp_t[:] pos npy_datetimestruct dts - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False + int64_t local_val + Localizer info = Localizer(tz) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + pos = info.prepare(stamps) for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - if use_utc: - local_val = stamps[i] - elif use_tzlocal: - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta - else: - local_val = stamps[i] + deltas[pos[i]] + local_val = utc_val_to_local_val(info, stamps[i], pos, i) dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(&dts, freq) From 49bd3d3853c94810fc3db8e31f81d117bbc7875a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Mar 2022 19:04:48 -0800 Subject: [PATCH 02/16] REF: use utc_val_to_local_val --- pandas/_libs/tslibs/timezones.pyx | 2 ++ pandas/_libs/tslibs/vectorized.pyx | 53 ++++++++---------------------- 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7feeb7d045b78..30dda92abf28b 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -439,6 +439,8 @@ cdef class Localizer: self.delta = deltas[0] else: self.use_dst = True + if typ == "pytz": + self.use_pytz = True cdef ndarray[int64_t] prepare(self, const int64_t[:] stamps): if self.use_dst: diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 456e08d437d09..c7d353ad37362 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -124,17 +124,12 @@ def ints_to_pydatetime( """ cdef: Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans - int64_t[:] deltas intp_t[:] pos npy_datetimestruct dts - object dt, new_tz - str typ - int64_t value, local_val, delta = NPY_NAT # dummy for delta + object new_tz + int64_t value, local_val ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) - bint use_utc = False, use_tzlocal = False, use_fixed = False - bint use_pytz = False Localizer info = Localizer(tz) if box == "date": @@ -152,19 +147,7 @@ def ints_to_pydatetime( "box must be one of 'datetime', 'date', 'time' or 'timestamp'" ) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 - use_pytz = typ == "pytz" + pos = info.prepare(stamps) for i in range(n): new_tz = tz @@ -172,26 +155,16 @@ def ints_to_pydatetime( if value == NPY_NAT: result[i] = NaT - else: - if info.use_utc: - local_val = value - elif info.use_tzlocal: - local_val = tz_convert_utc_to_tzlocal(value, tz) - elif info.use_fixed: - local_val = value + delta - elif not use_pytz: - # i.e. dateutil - # no zone-name change for dateutil tzs - dst etc - # represented in single object. - local_val = value + deltas[pos[i]] - else: - # pytz - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos[i]]] - local_val = value + deltas[pos[i]] - - dt64_to_dtstruct(local_val, &dts) - result[i] = func_create(value, dts, new_tz, freq, fold) + continue + + local_val = utc_val_to_local_val(info, value, pos, i) + + if info.use_pytz: + # find right representation of dst etc in pytz timezone + new_tz = tz._tzinfos[tz._transition_info[pos[i]]] + + dt64_to_dtstruct(local_val, &dts) + result[i] = func_create(value, dts, new_tz, freq, fold) return result From 64d78de61e7771451b0c9448f5358faf12811c76 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Mar 2022 20:35:57 -0800 Subject: [PATCH 03/16] reuse utc_val_to_local_val --- pandas/_libs/tslibs/tzconversion.pyx | 52 ++++++---------------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index c24fdf5714d68..d465a8f1a9d7e 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -461,53 +461,23 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): converted : ndarray[int64_t] """ cdef: - int64_t[:] converted, deltas + int64_t[:] converted Py_ssize_t i, n = len(vals) - int64_t val, delta + int64_t val intp_t[:] pos - ndarray[int64_t] trans - str typ + Localizer info = Localizer(tz) - if is_utc(tz): - return vals - elif is_tzlocal(tz): - converted = np.empty(n, dtype=np.int64) - for i in range(n): - val = vals[i] - if val == NPY_NAT: - converted[i] = NPY_NAT - else: - converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc=False) - else: - converted = np.empty(n, dtype=np.int64) + converted = np.empty(n, dtype=np.int64) - trans, deltas, typ = get_dst_info(tz) + pos = info.prepare(vals) - if typ not in ["pytz", "dateutil"]: - # FixedOffset, we know len(deltas) == 1 - delta = deltas[0] - - for i in range(n): - val = vals[i] - if val == NPY_NAT: - converted[i] = val - else: - converted[i] = val + delta - - else: - pos = trans.searchsorted(vals, side="right") - 1 - - for i in range(n): - val = vals[i] - if val == NPY_NAT: - converted[i] = val - else: - if pos[i] < 0: - # TODO: How is this reached? Should we be checking for - # it elsewhere? - raise ValueError("First time before start of DST info") + for i in range(n): + val = vals[i] + if val == NPY_NAT: + converted[i] = NPY_NAT + continue - converted[i] = val + deltas[pos[i]] + converted[i] = utc_val_to_local_val(info, val, pos, i) return converted From ba739606495677d4f3b9948c47a5346c93bf7bdd Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 08:01:34 -0800 Subject: [PATCH 04/16] intp_t[:]->intp_t* --- pandas/_libs/tslibs/timezones.pxd | 4 +++- pandas/_libs/tslibs/timezones.pyx | 14 +++++++++++--- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 15 +++++++-------- pandas/_libs/tslibs/vectorized.pyx | 10 +++++----- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 1bc12c7bd8619..d7588eeefbb5e 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -5,6 +5,7 @@ from cpython.datetime cimport ( ) from numpy cimport ( int64_t, + intp_t, ndarray, ) @@ -39,4 +40,5 @@ cdef class Localizer: int64_t delta str typ - cdef ndarray[int64_t] prepare(self, const int64_t[:] stamps) + cdef int64_t prepare1(self, int64_t utc_val) + cdef intp_t* prepare(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 30dda92abf28b..40b55315d56bd 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -26,7 +26,10 @@ UTC = pytz.utc import numpy as np cimport numpy as cnp -from numpy cimport int64_t +from numpy cimport ( + int64_t, + intp_t, +) cnp.import_array() @@ -442,6 +445,11 @@ cdef class Localizer: if typ == "pytz": self.use_pytz = True - cdef ndarray[int64_t] prepare(self, const int64_t[:] stamps): + cdef int64_t prepare1(self, int64_t utc_val): + if self.use_dst: + return self.trans.searchsorted(utc_val, side="right") - 1 + + cdef intp_t* prepare(self, const int64_t[:] stamps): if self.use_dst: - return self.trans.searchsorted(stamps, side="right") - 1 + + return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index d7d78d6f2f1a9..dd9b4e9ecae2f 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -16,4 +16,4 @@ cdef int64_t tz_localize_to_utc_single( ) except? -1 -cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t[:] pos, Py_ssize_t i) +cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index d465a8f1a9d7e..7b5442bf68fa8 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -404,18 +404,17 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): int64_t[:] deltas ndarray[int64_t, ndim=1] trans intp_t pos + Localizer info = Localizer(tz) if val == NPY_NAT: return val - if is_utc(tz): + if info.use_utc: return val - elif is_tzlocal(tz): + elif info.use_tzlocal: return _tz_convert_tzlocal_utc(val, tz, to_utc=False) - elif is_fixed_offset(tz): - _, deltas, _ = get_dst_info(tz) - delta = deltas[0] - return val + delta + elif info.use_fixed: + return val + info.delta else: trans, deltas, _ = get_dst_info(tz) pos = trans.searchsorted(val, side="right") - 1 @@ -464,7 +463,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): int64_t[:] converted Py_ssize_t i, n = len(vals) int64_t val - intp_t[:] pos + intp_t* pos Localizer info = Localizer(tz) converted = np.empty(n, dtype=np.int64) @@ -540,7 +539,7 @@ cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, # TODO: make this a Localizer method? would require moving tzlocal func -cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t[:] pos, Py_ssize_t i): +cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i): cdef: int64_t local_val diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c7d353ad37362..17aaf680df6a5 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -124,7 +124,7 @@ def ints_to_pydatetime( """ cdef: Py_ssize_t i, n = len(stamps) - intp_t[:] pos + intp_t* pos npy_datetimestruct dts object new_tz int64_t value, local_val @@ -203,7 +203,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso - intp_t[:] pos + intp_t* pos int64_t local_val Localizer info = Localizer(tz) @@ -245,7 +245,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - intp_t[:] pos + intp_t* pos int64_t local_val Localizer info = Localizer(tz) @@ -284,7 +284,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: Py_ssize_t i, n = len(stamps) int64_t local_val int64_t day_nanos = 24 * 3600 * 1_000_000_000 - intp_t[:] pos + intp_t* pos Localizer info = Localizer(tz) pos = info.prepare(stamps) @@ -307,7 +307,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - intp_t[:] pos + intp_t* pos npy_datetimestruct dts int64_t local_val Localizer info = Localizer(tz) From 866eee2fd61e2c8ad7dfe002e02af7f02a9d673d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 08:23:28 -0800 Subject: [PATCH 05/16] avoid double-copy --- pandas/_libs/tslibs/tzconversion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 7b5442bf68fa8..eb863d6660f52 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -441,7 +441,7 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): return np.array([], dtype=np.int64) converted = _tz_convert_from_utc(vals, tz) - return np.array(converted, dtype=np.int64) + return np.asarray(converted, dtype=np.int64) @cython.boundscheck(False) From 7d04c1c552c68512525fe68c36be09ee1bdb3121 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 11:38:35 -0800 Subject: [PATCH 06/16] PERF tweeks --- pandas/_libs/tslibs/timezones.pyx | 3 +++ pandas/_libs/tslibs/tzconversion.pyx | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 40b55315d56bd..f5e316980aea7 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -3,6 +3,8 @@ from datetime import ( timezone, ) +import cython + from cpython.datetime cimport ( datetime, timedelta, @@ -411,6 +413,7 @@ def tz_standardize(tz: tzinfo) -> tzinfo: return tz +@cython.freelist(16) cdef class Localizer: # cdef: # tzinfo tz diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index eb863d6660f52..0717a56cfa77c 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -466,6 +466,10 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): intp_t* pos Localizer info = Localizer(tz) + if info.use_utc: + # fastpath + return vals.copy() + converted = np.empty(n, dtype=np.int64) pos = info.prepare(vals) From a8c56fe5d77441fb5437368786513a2e80052706 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 12:59:46 -0800 Subject: [PATCH 07/16] move Localizer to tzconversion --- pandas/_libs/tslibs/timezones.pxd | 22 ------------ pandas/_libs/tslibs/timezones.pyx | 50 +--------------------------- pandas/_libs/tslibs/tzconversion.pxd | 20 +++++++++-- pandas/_libs/tslibs/tzconversion.pyx | 46 ++++++++++++++++++++++++- pandas/_libs/tslibs/vectorized.pyx | 3 +- 5 files changed, 65 insertions(+), 76 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index d7588eeefbb5e..13f196a567952 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -3,11 +3,6 @@ from cpython.datetime cimport ( timedelta, tzinfo, ) -from numpy cimport ( - int64_t, - intp_t, - ndarray, -) cdef tzinfo utc_pytz @@ -25,20 +20,3 @@ cdef timedelta get_utcoffset(tzinfo tz, datetime obj) cdef bint is_fixed_offset(tzinfo tz) cdef object get_dst_info(tzinfo tz) - - -cdef class Localizer: - cdef: - tzinfo tz - bint use_utc - bint use_fixed - bint use_tzlocal - bint use_dst - bint use_pytz - ndarray trans - int64_t[:] deltas - int64_t delta - str typ - - cdef int64_t prepare1(self, int64_t utc_val) - cdef intp_t* prepare(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index f5e316980aea7..ca33490b472ce 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -28,10 +28,7 @@ UTC = pytz.utc import numpy as np cimport numpy as cnp -from numpy cimport ( - int64_t, - intp_t, -) +from numpy cimport int64_t cnp.import_array() @@ -411,48 +408,3 @@ def tz_standardize(tz: tzinfo) -> tzinfo: if treat_tz_as_pytz(tz): return pytz.timezone(str(tz)) return tz - - -@cython.freelist(16) -cdef class Localizer: - # cdef: - # tzinfo tz - # bint use_utc - # bint use_fixed - # bint use_tzlocal - # bint use_pytz - # bint use_dst - # ndarray trans - # int64_t[:] deltas - # int64_t delta - # str typ - - def __cinit__(self, tzinfo tz): - self.tz = tz - if is_utc(tz) or tz is None: - self.use_utc = True - elif is_tzlocal(tz): - self.use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - self.trans = trans - self.deltas = deltas - self.typ = typ - - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - self.use_fixed = True - self.delta = deltas[0] - else: - self.use_dst = True - if typ == "pytz": - self.use_pytz = True - - cdef int64_t prepare1(self, int64_t utc_val): - if self.use_dst: - return self.trans.searchsorted(utc_val, side="right") - 1 - - cdef intp_t* prepare(self, const int64_t[:] stamps): - if self.use_dst: - - return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index dd9b4e9ecae2f..0b14439ffe706 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -2,10 +2,9 @@ from cpython.datetime cimport tzinfo from numpy cimport ( int64_t, intp_t, + ndarray, ) -from pandas._libs.tslibs.timezones cimport Localizer - cdef int64_t tz_convert_utc_to_tzlocal( int64_t utc_val, tzinfo tz, bint* fold=* @@ -16,4 +15,21 @@ cdef int64_t tz_localize_to_utc_single( ) except? -1 +cdef class Localizer: + cdef: + tzinfo tz + bint use_utc + bint use_fixed + bint use_tzlocal + bint use_dst + bint use_pytz + ndarray trans + int64_t[:] deltas + int64_t delta + str typ + + cdef int64_t prepare1(self, int64_t utc_val) + cdef intp_t* prepare(self, const int64_t[:] stamps) + + cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 0717a56cfa77c..ee9d150bede06 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -38,7 +38,6 @@ from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, ) from pandas._libs.tslibs.timezones cimport ( - Localizer, get_dst_info, get_utcoffset, is_fixed_offset, @@ -542,6 +541,51 @@ cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, return val + delta +@cython.freelist(16) +cdef class Localizer: + # cdef: + # tzinfo tz + # bint use_utc + # bint use_fixed + # bint use_tzlocal + # bint use_pytz + # bint use_dst + # ndarray trans + # int64_t[:] deltas + # int64_t delta + # str typ + + def __cinit__(self, tzinfo tz): + self.tz = tz + if is_utc(tz) or tz is None: + self.use_utc = True + elif is_tzlocal(tz): + self.use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + self.trans = trans + self.deltas = deltas + self.typ = typ + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + self.use_fixed = True + self.delta = deltas[0] + else: + self.use_dst = True + if typ == "pytz": + self.use_pytz = True + + cdef int64_t prepare1(self, int64_t utc_val): + if self.use_dst: + return self.trans.searchsorted(utc_val, side="right") - 1 + + cdef intp_t* prepare(self, const int64_t[:] stamps): + if self.use_dst: + + return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) + + # TODO: make this a Localizer method? would require moving tzlocal func cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i): cdef: diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 17aaf680df6a5..38650ac669359 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -31,13 +31,12 @@ from .offsets cimport BaseOffset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts from .timezones cimport ( - Localizer, get_dst_info, is_tzlocal, is_utc, ) from .tzconversion cimport ( - tz_convert_utc_to_tzlocal, + Localizer, utc_val_to_local_val, ) From f84b93ecc2092db82b05959fa1f4475c6f3c0c7c Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 15:33:35 -0800 Subject: [PATCH 08/16] use pattern --- pandas/_libs/tslibs/conversion.pyx | 47 +++++++++++++++------------- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 26 ++++++--------- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index e30a91ae3e10a..6680ab1049d12 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -542,6 +542,7 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, datetime dt ndarray[int64_t] trans int64_t[:] deltas + Py_ssize_t pos value = dtstruct_to_dt64(&dts) obj.dts = dts @@ -562,7 +563,7 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, if typ == 'dateutil': pos = trans.searchsorted(obj.value, side='right') - 1 - obj.fold = _infer_tsobject_fold(obj, trans, deltas, pos) + obj.fold = _infer_tsobject_fold(obj.value, trans, deltas, pos) # Keep the converter same as PyDateTime's dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, @@ -732,32 +733,35 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): if is_fixed_offset(tz): # static/fixed tzinfo; in this case we know len(deltas) == 1 # This can come back with `typ` of either "fixed" or None - dt64_to_dtstruct(obj.value + deltas[0], &obj.dts) - elif typ == 'pytz': - # i.e. treat_tz_as_pytz(tz) - pos = trans.searchsorted(obj.value, side='right') - 1 - tz = tz._tzinfos[tz._transition_info[pos]] - dt64_to_dtstruct(obj.value + deltas[pos], &obj.dts) - elif typ == 'dateutil': - # i.e. treat_tz_as_dateutil(tz) - pos = trans.searchsorted(obj.value, side='right') - 1 - dt64_to_dtstruct(obj.value + deltas[pos], &obj.dts) - # dateutil supports fold, so we infer fold from value - obj.fold = _infer_tsobject_fold(obj, trans, deltas, pos) + local_val = obj.value + deltas[0] + else: - # Note: as of 2018-07-17 all tzinfo objects that are _not_ - # either pytz or dateutil have is_fixed_offset(tz) == True, - # so this branch will never be reached. - pass + pos = trans.searchsorted(obj.value, side='right') - 1 + local_val = obj.value + deltas[pos] + + if typ == 'pytz': + # i.e. treat_tz_as_pytz(tz) + tz = tz._tzinfos[tz._transition_info[pos]] + elif typ == 'dateutil': + # i.e. treat_tz_as_dateutil(tz) + # dateutil supports fold, so we infer fold from value + obj.fold = _infer_tsobject_fold(obj.value, trans, deltas, pos) + else: + # Note: as of 2018-07-17 all tzinfo objects that are _not_ + # either pytz or dateutil have is_fixed_offset(tz) == True, + # so this branch will never be reached. + pass + + dt64_to_dtstruct(local_val, &obj.dts) obj.tzinfo = tz cdef inline bint _infer_tsobject_fold( - _TSObject obj, + int64_t value, const int64_t[:] trans, const int64_t[:] deltas, - intp_t pos, + Py_ssize_t pos, ): """ Infer _TSObject fold property from value by assuming 0 and then setting @@ -765,12 +769,13 @@ cdef inline bint _infer_tsobject_fold( Parameters ---------- + val : int64_t obj : _TSObject trans : ndarray[int64_t] ndarray of offset transition points in nanoseconds since epoch. deltas : int64_t[:] array of offsets corresponding to transition points in trans. - pos : intp_t + pos : Py_ssize_t Position of the last transition point before taking fold into account. Returns @@ -791,7 +796,7 @@ cdef inline bint _infer_tsobject_fold( if pos > 0: fold_delta = deltas[pos - 1] - deltas[pos] - if obj.value - fold_delta < trans[pos]: + if value - fold_delta < trans[pos]: fold = 1 return fold diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 0b14439ffe706..2973289cb0ab8 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -28,7 +28,7 @@ cdef class Localizer: int64_t delta str typ - cdef int64_t prepare1(self, int64_t utc_val) + cdef intp_t* prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index ee9d150bede06..23cb5206ffbcb 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -399,25 +399,15 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): converted: int64 """ cdef: - int64_t delta - int64_t[:] deltas - ndarray[int64_t, ndim=1] trans - intp_t pos + intp_t* pos Localizer info = Localizer(tz) if val == NPY_NAT: return val - if info.use_utc: - return val - elif info.use_tzlocal: - return _tz_convert_tzlocal_utc(val, tz, to_utc=False) - elif info.use_fixed: - return val + info.delta - else: - trans, deltas, _ = get_dst_info(tz) - pos = trans.searchsorted(val, side="right") - 1 - return val + deltas[pos] + pos = info.prepare1(val) + + return utc_val_to_local_val(info, val, pos, 0) def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @@ -576,9 +566,13 @@ cdef class Localizer: if typ == "pytz": self.use_pytz = True - cdef int64_t prepare1(self, int64_t utc_val): + cdef intp_t* prepare1(self, int64_t utc_val): + cdef: + intp_t loc + if self.use_dst: - return self.trans.searchsorted(utc_val, side="right") - 1 + loc = self.trans.searchsorted(utc_val, side="right") - 1 + return &loc cdef intp_t* prepare(self, const int64_t[:] stamps): if self.use_dst: From d055889def4672047bc3a5ea029520261427fc55 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 16:15:25 -0800 Subject: [PATCH 09/16] make utc_val_to_local_val a method --- pandas/_libs/tslibs/tzconversion.pxd | 4 +--- pandas/_libs/tslibs/tzconversion.pyx | 32 +++++++++++++--------------- pandas/_libs/tslibs/vectorized.pyx | 20 ++++++----------- 3 files changed, 22 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 2973289cb0ab8..3e3f3f23dfdda 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -30,6 +30,4 @@ cdef class Localizer: cdef intp_t* prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) - - -cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i) + cdef int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 23cb5206ffbcb..a15495763d8a0 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -407,7 +407,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): pos = info.prepare1(val) - return utc_val_to_local_val(info, val, pos, 0) + return info.utc_val_to_local_val(val, pos, 0) def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @@ -469,7 +469,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): converted[i] = NPY_NAT continue - converted[i] = utc_val_to_local_val(info, val, pos, i) + converted[i] = info.utc_val_to_local_val(val, pos, i) return converted @@ -579,19 +579,17 @@ cdef class Localizer: return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) + cdef int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i): + cdef: + int64_t local_val + + if self.use_utc: + local_val = utc_val + elif self.use_tzlocal: + local_val = _tz_convert_tzlocal_utc(utc_val, self.tz, to_utc=False) + elif self.use_fixed: + local_val = utc_val + self.delta + else: + local_val = utc_val + self.deltas[pos[i]] -# TODO: make this a Localizer method? would require moving tzlocal func -cdef int64_t utc_val_to_local_val(Localizer info, int64_t utc_val, intp_t* pos, Py_ssize_t i): - cdef: - int64_t local_val - - if info.use_utc: - local_val = utc_val - elif info.use_tzlocal: - local_val = _tz_convert_tzlocal_utc(utc_val, info.tz, to_utc=False) - elif info.use_fixed: - local_val = utc_val + info.delta - else: - local_val = utc_val + info.deltas[pos[i]] - - return local_val + return local_val diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 38650ac669359..42f4976c091e8 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -30,15 +30,7 @@ from .np_datetime cimport ( from .offsets cimport BaseOffset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport ( - get_dst_info, - is_tzlocal, - is_utc, -) -from .tzconversion cimport ( - Localizer, - utc_val_to_local_val, -) +from .tzconversion cimport Localizer # ------------------------------------------------------------------------- @@ -156,7 +148,7 @@ def ints_to_pydatetime( result[i] = NaT continue - local_val = utc_val_to_local_val(info, value, pos, i) + local_val = info.utc_val_to_local_val(value, pos, i) if info.use_pytz: # find right representation of dst etc in pytz timezone @@ -212,7 +204,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: if stamps[i] == NPY_NAT: continue - local_val = utc_val_to_local_val(info, stamps[i], pos, i) + local_val = info.utc_val_to_local_val(stamps[i], pos, i) dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) @@ -255,7 +247,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue - local_val = utc_val_to_local_val(info, stamps[i], pos, i) + local_val = info.utc_val_to_local_val(stamps[i], pos, i) result[i] = normalize_i8_stamp(local_val) @@ -289,7 +281,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: pos = info.prepare(stamps) for i in range(n): - local_val = utc_val_to_local_val(info, stamps[i], pos, i) + local_val = info.utc_val_to_local_val(stamps[i], pos, i) if local_val % day_nanos != 0: return False @@ -318,7 +310,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = NPY_NAT continue - local_val = utc_val_to_local_val(info, stamps[i], pos, i) + local_val = info.utc_val_to_local_val(stamps[i], pos, i) dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(&dts, freq) From 80f950cb7d6ef7b54c7811b9f052501c897616eb Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Mar 2022 21:25:37 -0800 Subject: [PATCH 10/16] try making is_date_array_normalized a method --- pandas/_libs/tslibs/tzconversion.pxd | 3 ++- pandas/_libs/tslibs/tzconversion.pyx | 24 +++++++++++++++++++++++- pandas/_libs/tslibs/vectorized.pyx | 12 +++--------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 3e3f3f23dfdda..8af9416cc62a3 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -30,4 +30,5 @@ cdef class Localizer: cdef intp_t* prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) - cdef int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i) + cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i) + cdef bint is_date_array_normalized(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a15495763d8a0..175210c0fc58f 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -579,7 +579,7 @@ cdef class Localizer: return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) - cdef int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i): + cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i): cdef: int64_t local_val @@ -593,3 +593,25 @@ cdef class Localizer: local_val = utc_val + self.deltas[pos[i]] return local_val + + # WTF both tests and perf seem sensitive to these two decorators, + # but in a hard-to-reproduce manner. + # In particular pandas/tests/indexes/datetimes/test_timezones.py + @cython.wraparound(False) + @cython.boundscheck(False) + cdef bint is_date_array_normalized(self, const int64_t[:] stamps): + cdef: + Py_ssize_t i, n = len(stamps) + int64_t local_val + int64_t day_nanos = 24 * 3600 * 1_000_000_000 + intp_t* pos + + pos = self.prepare(stamps) + + for i in range(n): + local_val = self.utc_val_to_local_val(stamps[i], pos, i) + + if local_val % day_nanos != 0: + return False + + return True diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 42f4976c091e8..bb91324dd54f4 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -189,6 +189,8 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): return RESO_DAY +@cython.wraparound(False) +@cython.boundscheck(False) def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Py_ssize_t i, n = len(stamps) @@ -278,15 +280,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: intp_t* pos Localizer info = Localizer(tz) - pos = info.prepare(stamps) - - for i in range(n): - local_val = info.utc_val_to_local_val(stamps[i], pos, i) - - if local_val % day_nanos != 0: - return False - - return True + return info.is_date_array_normalized(stamps) # ------------------------------------------------------------------------- From 444e1de436d9c92babf05423f2d24c0583d4f595 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Mar 2022 09:56:24 -0800 Subject: [PATCH 11/16] revert --- pandas/_libs/tslibs/tzconversion.pxd | 1 - pandas/_libs/tslibs/tzconversion.pyx | 22 ---------------------- pandas/_libs/tslibs/vectorized.pyx | 10 +++++++++- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 8af9416cc62a3..adaa7ccd0b01c 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -31,4 +31,3 @@ cdef class Localizer: cdef intp_t* prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i) - cdef bint is_date_array_normalized(self, const int64_t[:] stamps) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 175210c0fc58f..9c58a7260be9c 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -593,25 +593,3 @@ cdef class Localizer: local_val = utc_val + self.deltas[pos[i]] return local_val - - # WTF both tests and perf seem sensitive to these two decorators, - # but in a hard-to-reproduce manner. - # In particular pandas/tests/indexes/datetimes/test_timezones.py - @cython.wraparound(False) - @cython.boundscheck(False) - cdef bint is_date_array_normalized(self, const int64_t[:] stamps): - cdef: - Py_ssize_t i, n = len(stamps) - int64_t local_val - int64_t day_nanos = 24 * 3600 * 1_000_000_000 - intp_t* pos - - pos = self.prepare(stamps) - - for i in range(n): - local_val = self.utc_val_to_local_val(stamps[i], pos, i) - - if local_val % day_nanos != 0: - return False - - return True diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index bb91324dd54f4..7e8c6710e05a0 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -280,7 +280,15 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: intp_t* pos Localizer info = Localizer(tz) - return info.is_date_array_normalized(stamps) + pos = info.prepare(stamps) + + for i in range(n): + local_val = info.utc_val_to_local_val(stamps[i], pos, i) + + if local_val % day_nanos != 0: + return False + + return True # ------------------------------------------------------------------------- From 9ad95964afc788120df35a1d4ecaabd63375eadb Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Mar 2022 09:58:51 -0800 Subject: [PATCH 12/16] remove unused import --- pandas/_libs/tslibs/timezones.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index ca33490b472ce..224c5be1f3b7d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -3,8 +3,6 @@ from datetime import ( timezone, ) -import cython - from cpython.datetime cimport ( datetime, timedelta, From 71ecf456af7d4a892b8e4028447a1ccd397e412e Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Mar 2022 09:59:21 -0800 Subject: [PATCH 13/16] cln --- pandas/_libs/tslibs/tzconversion.pxd | 4 +++- pandas/_libs/tslibs/tzconversion.pyx | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index adaa7ccd0b01c..06921adf70136 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -30,4 +30,6 @@ cdef class Localizer: cdef intp_t* prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) - cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i) + cdef inline int64_t utc_val_to_local_val( + self, int64_t utc_val, intp_t* pos, Py_ssize_t i + ) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 9c58a7260be9c..d04da05b1908b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -577,9 +577,13 @@ cdef class Localizer: cdef intp_t* prepare(self, const int64_t[:] stamps): if self.use_dst: - return cnp.PyArray_DATA(self.trans.searchsorted(stamps, side="right") - 1) + return cnp.PyArray_DATA( + self.trans.searchsorted(stamps, side="right") - 1 + ) - cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, intp_t* pos, Py_ssize_t i): + cdef inline int64_t utc_val_to_local_val( + self, int64_t utc_val, intp_t* pos, Py_ssize_t i + ): cdef: int64_t local_val From 5d9581d9d5b174125372b1f3edc64ea855d3f2b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Mar 2022 17:07:06 -0800 Subject: [PATCH 14/16] small optimizations --- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 06921adf70136..1da9809ef9f79 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -24,7 +24,7 @@ cdef class Localizer: bint use_dst bint use_pytz ndarray trans - int64_t[:] deltas + int64_t[::1] deltas int64_t delta str typ diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index d04da05b1908b..8a2d28d14c9c1 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -545,6 +545,7 @@ cdef class Localizer: # int64_t delta # str typ + @cython.boundscheck(False) def __cinit__(self, tzinfo tz): self.tz = tz if is_utc(tz) or tz is None: @@ -581,6 +582,7 @@ cdef class Localizer: self.trans.searchsorted(stamps, side="right") - 1 ) + @cython.boundscheck(False) cdef inline int64_t utc_val_to_local_val( self, int64_t utc_val, intp_t* pos, Py_ssize_t i ): From a9c4547b653941fca8e77e46fbacafcbcd01784c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Mar 2022 17:51:59 -0800 Subject: [PATCH 15/16] Avoid invalid pointer return --- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 1da9809ef9f79..658c5940035bb 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -28,7 +28,7 @@ cdef class Localizer: int64_t delta str typ - cdef intp_t* prepare1(self, int64_t utc_val) + cdef intp_t prepare1(self, int64_t utc_val) cdef intp_t* prepare(self, const int64_t[:] stamps) cdef inline int64_t utc_val_to_local_val( self, int64_t utc_val, intp_t* pos, Py_ssize_t i diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 8a2d28d14c9c1..b0eeddaff3e3c 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -399,7 +399,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): converted: int64 """ cdef: - intp_t* pos + intp_t pos Localizer info = Localizer(tz) if val == NPY_NAT: @@ -407,7 +407,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): pos = info.prepare1(val) - return info.utc_val_to_local_val(val, pos, 0) + return info.utc_val_to_local_val(val, &pos, 0) def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @@ -567,13 +567,10 @@ cdef class Localizer: if typ == "pytz": self.use_pytz = True - cdef intp_t* prepare1(self, int64_t utc_val): - cdef: - intp_t loc - + cdef intp_t prepare1(self, int64_t utc_val): if self.use_dst: - loc = self.trans.searchsorted(utc_val, side="right") - 1 - return &loc + return self.trans.searchsorted(utc_val, side="right") - 1 + return 0 # won't be used cdef intp_t* prepare(self, const int64_t[:] stamps): if self.use_dst: From 18923954e0c5716505f13fde9345ccc000eb227c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Mar 2022 11:48:36 -0800 Subject: [PATCH 16/16] troublehsoot --- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 15 ++++++++++----- pandas/_libs/tslibs/vectorized.pyx | 23 ++++++++++++++++++----- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 658c5940035bb..82e3c87abbbd6 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -29,7 +29,7 @@ cdef class Localizer: str typ cdef intp_t prepare1(self, int64_t utc_val) - cdef intp_t* prepare(self, const int64_t[:] stamps) + cdef ndarray[intp_t] prepare(self, const int64_t[:] stamps) cdef inline int64_t utc_val_to_local_val( self, int64_t utc_val, intp_t* pos, Py_ssize_t i ) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index b0eeddaff3e3c..02ae53580c2c6 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -452,6 +452,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): int64_t[:] converted Py_ssize_t i, n = len(vals) int64_t val + ndarray[intp_t] pos_ intp_t* pos Localizer info = Localizer(tz) @@ -461,7 +462,8 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): converted = np.empty(n, dtype=np.int64) - pos = info.prepare(vals) + pos_ = info.prepare(vals) + pos = cnp.PyArray_DATA(pos_) for i in range(n): val = vals[i] @@ -572,12 +574,11 @@ cdef class Localizer: return self.trans.searchsorted(utc_val, side="right") - 1 return 0 # won't be used - cdef intp_t* prepare(self, const int64_t[:] stamps): + cdef ndarray[intp_t] prepare(self, const int64_t[:] stamps): if self.use_dst: - return cnp.PyArray_DATA( - self.trans.searchsorted(stamps, side="right") - 1 - ) + return self.trans.searchsorted(stamps, side="right") - 1 + return placeholder # won't be used @cython.boundscheck(False) cdef inline int64_t utc_val_to_local_val( @@ -596,3 +597,7 @@ cdef class Localizer: local_val = utc_val + self.deltas[pos[i]] return local_val + + +# Placeholder to return from 'prepare' +cdef ndarray placeholder = np.array([], dtype=np.intp) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 7e8c6710e05a0..514f58df2879f 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,5 +1,6 @@ import cython +cimport numpy as cnp from cpython.datetime cimport ( date, datetime, @@ -15,6 +16,8 @@ from numpy cimport ( ndarray, ) +cnp.import_array() + from .conversion cimport normalize_i8_stamp from .dtypes import Resolution @@ -115,6 +118,7 @@ def ints_to_pydatetime( """ cdef: Py_ssize_t i, n = len(stamps) + ndarray[intp_t] pos_ intp_t* pos npy_datetimestruct dts object new_tz @@ -138,7 +142,8 @@ def ints_to_pydatetime( "box must be one of 'datetime', 'date', 'time' or 'timestamp'" ) - pos = info.prepare(stamps) + pos_ = info.prepare(stamps) + pos = cnp.PyArray_DATA(pos_) for i in range(n): new_tz = tz @@ -196,11 +201,13 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso + ndarray[intp_t] pos_ intp_t* pos int64_t local_val Localizer info = Localizer(tz) - pos = info.prepare(stamps) + pos_ = info.prepare(stamps) + pos = cnp.PyArray_DATA(pos_) for i in range(n): if stamps[i] == NPY_NAT: @@ -238,11 +245,13 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[intp_t] pos_ intp_t* pos int64_t local_val Localizer info = Localizer(tz) - pos = info.prepare(stamps) + pos_ = info.prepare(stamps) + pos = cnp.PyArray_DATA(pos_) for i in range(n): if stamps[i] == NPY_NAT: @@ -277,10 +286,12 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: Py_ssize_t i, n = len(stamps) int64_t local_val int64_t day_nanos = 24 * 3600 * 1_000_000_000 + ndarray[intp_t] pos_ intp_t* pos Localizer info = Localizer(tz) - pos = info.prepare(stamps) + pos_ = info.prepare(stamps) + pos = cnp.PyArray_DATA(pos_) for i in range(n): local_val = info.utc_val_to_local_val(stamps[i], pos, i) @@ -300,12 +311,14 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[intp_t] pos_ intp_t* pos npy_datetimestruct dts int64_t local_val Localizer info = Localizer(tz) - pos = info.prepare(stamps) + pos_ = info.prepare(stamps) + pos = cnp.PyArray_DATA(pos_) for i in range(n): if stamps[i] == NPY_NAT: