diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 5d97c02..9e18a64 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -2471,6 +2471,12 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]: _SAS_DATETIME_SAFE_S = 7_500_000_000 _SAS_DATETIME_SAFE_D = 87_000 +# Number of non-null values :func:`_safe_object_to_datetime` peeks at to +# decide which parse path to use for the whole chunk. Keeps format +# detection to a bounded cost so a 1M-row chunk doesn't pay for a +# full row-walk just to figure out what shape its dates are in. +_DATETIME_FORMAT_SAMPLE = 16 + def _safe_numeric_to_datetime( series: pd.Series, @@ -2590,42 +2596,74 @@ def _safe_object_to_datetime( numeric, unit="s", column_name=column_name, target_type=target_type, ) - non_null_count = int(coerced.notna().sum()) - if non_null_count: - # First pass: our regex-based ``DD-MON-YY`` parser. Cheap, - # locale-independent, covers the cases ``pd.to_datetime`` warns - # about. Always parse via the datetime-aware variant so a DATE - # target whose chunk happens to carry time components - # (``23-MAR-20 14:30:00``) still parses without warnings; the - # caller's ``.dt.date`` cast truncates the time, matching the - # existing datetime64-input branch. - parsed_py = coerced.map( - lambda v: _parse_flexible_datetime(v) if v is not None else None - ) - parsed_ts = pd.to_datetime(parsed_py, errors="coerce") - if int(parsed_ts.notna().sum()) == non_null_count: - return parsed_ts + # Format sniff: peek at up to ``_DATETIME_FORMAT_SAMPLE`` non-null + # string values to decide the parse path for the WHOLE chunk. The + # previous version ran a full row-walk + up-to-20 vectorized + # ``pd.to_datetime(format=fmt)`` attempts per column per chunk; a + # single fat chunk (millions of rows × a few date columns) could + # pin a CPU for minutes. Sniffing first keeps the hot path to one + # O(n) pass. + non_null = coerced.dropna() + if not non_null.empty: + samples: List[str] = [] + for v in non_null.head(_DATETIME_FORMAT_SAMPLE): + if isinstance(v, str): + samples.append(v.strip()) + else: + # Mixed object column (e.g. already-parsed Timestamps + + # strings). Skip sniffing; let dateutil handle it. + samples = [] + break - # Second pass: vectorized ``pd.to_datetime`` with each explicit - # format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C; - # trying a handful of them still beats row-by-row dateutil on - # large chunks. Accept the first format that covers every - # non-null cell. - for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS: - try: + if samples: + # DD-MON-YY family: one pandas ``Series.map`` with our + # regex parser, then a single ``pd.to_datetime`` to land + # on ``datetime64[ns]``. ``pd.to_datetime(format=...)`` + # has no ``%b``-with-locale-free semantics, so this is + # the vectorized win available for this format family. + if all(_DDMONYY_RE.match(s) for s in samples): with np.errstate(over="ignore", invalid="ignore", divide="ignore"): - candidate = pd.to_datetime(coerced, format=fmt, errors="coerce") - except (ValueError, TypeError): - continue - if int(candidate.notna().sum()) == non_null_count: - return candidate + return pd.to_datetime( + coerced.map( + lambda v: _parse_flexible_datetime(v) + if isinstance(v, str) else None + ), + errors="coerce", + ) - # Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles - # shapes our explicit list missed (rare edge cases, mixed formats - # within one column). Same safety net as the numeric path. + # Numeric strptime shapes: pick the first format that + # parses every sample, then run ONE vectorized + # ``pd.to_datetime(format=fmt)`` over the full column. + # Bounded to the sample pass -- no 20×O(n) blow-up. + for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS: + ok = True + for s in samples: + try: + dt.datetime.strptime(s, fmt) + except ValueError: + ok = False + break + if ok: + try: + with np.errstate(over="ignore", invalid="ignore", divide="ignore"): + return pd.to_datetime( + coerced, format=fmt, errors="coerce", + ) + except (ValueError, TypeError): + continue + + # Fallback: ``pd.to_datetime`` / dateutil. Handles shapes the + # sniffer missed (mixed formats within one column, + # already-parsed Timestamp/date objects sharing space with + # strings, ISO 8601 with offsets, etc.). Wrap in a warning + # filter because the unformatted path emits ``UserWarning: + # Could not infer format...`` once per chunk and we don't want + # the progress bar drowned. try: - with np.errstate(over="ignore", invalid="ignore", divide="ignore"): - return pd.to_datetime(coerced, errors="coerce") + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + with np.errstate(over="ignore", invalid="ignore", divide="ignore"): + return pd.to_datetime(coerced, errors="coerce") except (FloatingPointError, OverflowError, ValueError) as exc: tqdm.write( f"[error] {target_type} column {column_name!r}: "