advanced_dates #11

Merged
dp merged 4 commits from advanced_dates into main 2026-04-23 22:33:18 +00:00
Showing only changes of commit 857f696305 - Show all commits

View File

@ -2471,6 +2471,12 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]:
_SAS_DATETIME_SAFE_S = 7_500_000_000 _SAS_DATETIME_SAFE_S = 7_500_000_000
_SAS_DATETIME_SAFE_D = 87_000 _SAS_DATETIME_SAFE_D = 87_000
# Number of non-null values :func:`_safe_object_to_datetime` peeks at to
# decide which parse path to use for the whole chunk. Keeps format
# detection to a bounded cost so a 1M-row chunk doesn't pay for a
# full row-walk just to figure out what shape its dates are in.
_DATETIME_FORMAT_SAMPLE = 16
def _safe_numeric_to_datetime( def _safe_numeric_to_datetime(
series: pd.Series, series: pd.Series,
@ -2590,42 +2596,74 @@ def _safe_object_to_datetime(
numeric, unit="s", column_name=column_name, target_type=target_type, numeric, unit="s", column_name=column_name, target_type=target_type,
) )
non_null_count = int(coerced.notna().sum()) # Format sniff: peek at up to ``_DATETIME_FORMAT_SAMPLE`` non-null
if non_null_count: # string values to decide the parse path for the WHOLE chunk. The
# First pass: our regex-based ``DD-MON-YY`` parser. Cheap, # previous version ran a full row-walk + up-to-20 vectorized
# locale-independent, covers the cases ``pd.to_datetime`` warns # ``pd.to_datetime(format=fmt)`` attempts per column per chunk; a
# about. Always parse via the datetime-aware variant so a DATE # single fat chunk (millions of rows × a few date columns) could
# target whose chunk happens to carry time components # pin a CPU for minutes. Sniffing first keeps the hot path to one
# (``23-MAR-20 14:30:00``) still parses without warnings; the # O(n) pass.
# caller's ``.dt.date`` cast truncates the time, matching the non_null = coerced.dropna()
# existing datetime64-input branch. if not non_null.empty:
parsed_py = coerced.map( samples: List[str] = []
lambda v: _parse_flexible_datetime(v) if v is not None else None for v in non_null.head(_DATETIME_FORMAT_SAMPLE):
) if isinstance(v, str):
parsed_ts = pd.to_datetime(parsed_py, errors="coerce") samples.append(v.strip())
if int(parsed_ts.notna().sum()) == non_null_count: else:
return parsed_ts # Mixed object column (e.g. already-parsed Timestamps +
# strings). Skip sniffing; let dateutil handle it.
samples = []
break
# Second pass: vectorized ``pd.to_datetime`` with each explicit if samples:
# format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C; # DD-MON-YY family: one pandas ``Series.map`` with our
# trying a handful of them still beats row-by-row dateutil on # regex parser, then a single ``pd.to_datetime`` to land
# large chunks. Accept the first format that covers every # on ``datetime64[ns]``. ``pd.to_datetime(format=...)``
# non-null cell. # has no locale-independent ``%b`` parsing, so this is
for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS: # the vectorized win available for this format family.
try: if all(_DDMONYY_RE.match(s) for s in samples):
with np.errstate(over="ignore", invalid="ignore", divide="ignore"): with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
candidate = pd.to_datetime(coerced, format=fmt, errors="coerce") return pd.to_datetime(
except (ValueError, TypeError): coerced.map(
continue lambda v: _parse_flexible_datetime(v)
if int(candidate.notna().sum()) == non_null_count: if isinstance(v, str) else None
return candidate ),
errors="coerce",
)
# Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles # Numeric strptime shapes: pick the first format that
# shapes our explicit list missed (rare edge cases, mixed formats # parses every sample, then run ONE vectorized
# within one column). Same safety net as the numeric path. # ``pd.to_datetime(format=fmt)`` over the full column.
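# Format selection is bounded to the sample pass -- no 20×O(n) blow-up.
# Note: rows outside the sample that do not match the chosen format
# are coerced to NaT by ``errors="coerce"``; they do not reach the
# dateutil fallback below.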
for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
ok = True
for s in samples:
try:
dt.datetime.strptime(s, fmt)
except ValueError:
ok = False
break
if ok:
try:
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
return pd.to_datetime(
coerced, format=fmt, errors="coerce",
)
except (ValueError, TypeError):
continue
# Fallback: ``pd.to_datetime`` / dateutil. Handles shapes the
# sniffer missed (mixed formats within one column,
# already-parsed Timestamp/date objects sharing space with
# strings, ISO 8601 with offsets, etc.). Wrap in a warning
# filter because the unformatted path emits ``UserWarning:
# Could not infer format...`` once per chunk and we don't want
# the progress bar drowned.
try: try:
with np.errstate(over="ignore", invalid="ignore", divide="ignore"): with warnings.catch_warnings():
return pd.to_datetime(coerced, errors="coerce") warnings.simplefilter("ignore", category=UserWarning)
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
return pd.to_datetime(coerced, errors="coerce")
except (FloatingPointError, OverflowError, ValueError) as exc: except (FloatingPointError, OverflowError, ValueError) as exc:
tqdm.write( tqdm.write(
f"[error] {target_type} column {column_name!r}: " f"[error] {target_type} column {column_name!r}: "