Revert "Optimize datetime parsing in load_sas.py by implementing a sample-based format detection approach"
This reverts commit 857f696305.
This commit is contained in:
parent
857f696305
commit
998a3e282f
@ -2471,12 +2471,6 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]:
|
|||||||
_SAS_DATETIME_SAFE_S = 7_500_000_000
|
_SAS_DATETIME_SAFE_S = 7_500_000_000
|
||||||
_SAS_DATETIME_SAFE_D = 87_000
|
_SAS_DATETIME_SAFE_D = 87_000
|
||||||
|
|
||||||
# Number of non-null values :func:`_safe_object_to_datetime` peeks at to
|
|
||||||
# decide which parse path to use for the whole chunk. Keeps format
|
|
||||||
# detection to a bounded cost so a 1M-row chunk doesn't pay for a
|
|
||||||
# full row-walk just to figure out what shape its dates are in.
|
|
||||||
_DATETIME_FORMAT_SAMPLE = 16
|
|
||||||
|
|
||||||
|
|
||||||
def _safe_numeric_to_datetime(
|
def _safe_numeric_to_datetime(
|
||||||
series: pd.Series,
|
series: pd.Series,
|
||||||
@ -2596,72 +2590,40 @@ def _safe_object_to_datetime(
|
|||||||
numeric, unit="s", column_name=column_name, target_type=target_type,
|
numeric, unit="s", column_name=column_name, target_type=target_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Format sniff: peek at up to ``_DATETIME_FORMAT_SAMPLE`` non-null
|
non_null_count = int(coerced.notna().sum())
|
||||||
# string values to decide the parse path for the WHOLE chunk. The
|
if non_null_count:
|
||||||
# previous version ran a full row-walk + up-to-20 vectorized
|
# First pass: our regex-based ``DD-MON-YY`` parser. Cheap,
|
||||||
# ``pd.to_datetime(format=fmt)`` attempts per column per chunk; a
|
# locale-independent, covers the cases ``pd.to_datetime`` warns
|
||||||
# single fat chunk (millions of rows × a few date columns) could
|
# about. Always parse via the datetime-aware variant so a DATE
|
||||||
# pin a CPU for minutes. Sniffing first keeps the hot path to one
|
# target whose chunk happens to carry time components
|
||||||
# O(n) pass.
|
# (``23-MAR-20 14:30:00``) still parses without warnings; the
|
||||||
non_null = coerced.dropna()
|
# caller's ``.dt.date`` cast truncates the time, matching the
|
||||||
if not non_null.empty:
|
# existing datetime64-input branch.
|
||||||
samples: List[str] = []
|
parsed_py = coerced.map(
|
||||||
for v in non_null.head(_DATETIME_FORMAT_SAMPLE):
|
lambda v: _parse_flexible_datetime(v) if v is not None else None
|
||||||
if isinstance(v, str):
|
|
||||||
samples.append(v.strip())
|
|
||||||
else:
|
|
||||||
# Mixed object column (e.g. already-parsed Timestamps +
|
|
||||||
# strings). Skip sniffing; let dateutil handle it.
|
|
||||||
samples = []
|
|
||||||
break
|
|
||||||
|
|
||||||
if samples:
|
|
||||||
# DD-MON-YY family: one pandas ``Series.map`` with our
|
|
||||||
# regex parser, then a single ``pd.to_datetime`` to land
|
|
||||||
# on ``datetime64[ns]``. ``pd.to_datetime(format=...)``
|
|
||||||
# has no ``%b``-with-locale-free semantics, so this is
|
|
||||||
# the vectorized win available for this format family.
|
|
||||||
if all(_DDMONYY_RE.match(s) for s in samples):
|
|
||||||
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
|
||||||
return pd.to_datetime(
|
|
||||||
coerced.map(
|
|
||||||
lambda v: _parse_flexible_datetime(v)
|
|
||||||
if isinstance(v, str) else None
|
|
||||||
),
|
|
||||||
errors="coerce",
|
|
||||||
)
|
)
|
||||||
|
parsed_ts = pd.to_datetime(parsed_py, errors="coerce")
|
||||||
|
if int(parsed_ts.notna().sum()) == non_null_count:
|
||||||
|
return parsed_ts
|
||||||
|
|
||||||
# Numeric strptime shapes: pick the first format that
|
# Second pass: vectorized ``pd.to_datetime`` with each explicit
|
||||||
# parses every sample, then run ONE vectorized
|
# format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C;
|
||||||
# ``pd.to_datetime(format=fmt)`` over the full column.
|
# trying a handful of them still beats row-by-row dateutil on
|
||||||
# Bounded to the sample pass -- no 20×O(n) blow-up.
|
# large chunks. Accept the first format that covers every
|
||||||
|
# non-null cell.
|
||||||
for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
|
for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
|
||||||
ok = True
|
|
||||||
for s in samples:
|
|
||||||
try:
|
|
||||||
dt.datetime.strptime(s, fmt)
|
|
||||||
except ValueError:
|
|
||||||
ok = False
|
|
||||||
break
|
|
||||||
if ok:
|
|
||||||
try:
|
try:
|
||||||
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
||||||
return pd.to_datetime(
|
candidate = pd.to_datetime(coerced, format=fmt, errors="coerce")
|
||||||
coerced, format=fmt, errors="coerce",
|
|
||||||
)
|
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
continue
|
continue
|
||||||
|
if int(candidate.notna().sum()) == non_null_count:
|
||||||
|
return candidate
|
||||||
|
|
||||||
# Fallback: ``pd.to_datetime`` / dateutil. Handles shapes the
|
# Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles
|
||||||
# sniffer missed (mixed formats within one column,
|
# shapes our explicit list missed (rare edge cases, mixed formats
|
||||||
# already-parsed Timestamp/date objects sharing space with
|
# within one column). Same safety net as the numeric path.
|
||||||
# strings, ISO 8601 with offsets, etc.). Wrap in a warning
|
|
||||||
# filter because the unformatted path emits ``UserWarning:
|
|
||||||
# Could not infer format...`` once per chunk and we don't want
|
|
||||||
# the progress bar drowned.
|
|
||||||
try:
|
try:
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("ignore", category=UserWarning)
|
|
||||||
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
||||||
return pd.to_datetime(coerced, errors="coerce")
|
return pd.to_datetime(coerced, errors="coerce")
|
||||||
except (FloatingPointError, OverflowError, ValueError) as exc:
|
except (FloatingPointError, OverflowError, ValueError) as exc:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user