diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 9e18a64..5d97c02 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -2471,12 +2471,6 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]: _SAS_DATETIME_SAFE_S = 7_500_000_000 _SAS_DATETIME_SAFE_D = 87_000 -# Number of non-null values :func:`_safe_object_to_datetime` peeks at to -# decide which parse path to use for the whole chunk. Keeps format -# detection to a bounded cost so a 1M-row chunk doesn't pay for a -# full row-walk just to figure out what shape its dates are in. -_DATETIME_FORMAT_SAMPLE = 16 - def _safe_numeric_to_datetime( series: pd.Series, @@ -2596,74 +2590,42 @@ def _safe_object_to_datetime( numeric, unit="s", column_name=column_name, target_type=target_type, ) - # Format sniff: peek at up to ``_DATETIME_FORMAT_SAMPLE`` non-null - # string values to decide the parse path for the WHOLE chunk. The - # previous version ran a full row-walk + up-to-20 vectorized - # ``pd.to_datetime(format=fmt)`` attempts per column per chunk; a - # single fat chunk (millions of rows × a few date columns) could - # pin a CPU for minutes. Sniffing first keeps the hot path to one - # O(n) pass. - non_null = coerced.dropna() - if not non_null.empty: - samples: List[str] = [] - for v in non_null.head(_DATETIME_FORMAT_SAMPLE): - if isinstance(v, str): - samples.append(v.strip()) - else: - # Mixed object column (e.g. already-parsed Timestamps + - # strings). Skip sniffing; let dateutil handle it. - samples = [] - break + non_null_count = int(coerced.notna().sum()) + if non_null_count: + # First pass: our regex-based ``DD-MON-YY`` parser. Cheap, + # locale-independent, covers the cases ``pd.to_datetime`` warns + # about. Always parse via the datetime-aware variant so a DATE + # target whose chunk happens to carry time components + # (``23-MAR-20 14:30:00``) still parses without warnings; the + # caller's ``.dt.date`` cast truncates the time, matching the + # existing datetime64-input branch. + parsed_py = coerced.map( + lambda v: _parse_flexible_datetime(v) if v is not None else None + ) + parsed_ts = pd.to_datetime(parsed_py, errors="coerce") + if int(parsed_ts.notna().sum()) == non_null_count: + return parsed_ts - if samples: - # DD-MON-YY family: one pandas ``Series.map`` with our - # regex parser, then a single ``pd.to_datetime`` to land - # on ``datetime64[ns]``. ``pd.to_datetime(format=...)`` - # has no ``%b``-with-locale-free semantics, so this is - # the vectorized win available for this format family. - if all(_DDMONYY_RE.match(s) for s in samples): + # Second pass: vectorized ``pd.to_datetime`` with each explicit + # format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C; + # trying a handful of them still beats row-by-row dateutil on + # large chunks. Accept the first format that covers every + # non-null cell. + for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS: + try: with np.errstate(over="ignore", invalid="ignore", divide="ignore"): - return pd.to_datetime( - coerced.map( - lambda v: _parse_flexible_datetime(v) - if isinstance(v, str) else None - ), - errors="coerce", - ) + candidate = pd.to_datetime(coerced, format=fmt, errors="coerce") + except (ValueError, TypeError): + continue + if int(candidate.notna().sum()) == non_null_count: + return candidate - # Numeric strptime shapes: pick the first format that - # parses every sample, then run ONE vectorized - # ``pd.to_datetime(format=fmt)`` over the full column. - # Bounded to the sample pass -- no 20×O(n) blow-up. - for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS: - ok = True - for s in samples: - try: - dt.datetime.strptime(s, fmt) - except ValueError: - ok = False - break - if ok: - try: - with np.errstate(over="ignore", invalid="ignore", divide="ignore"): - return pd.to_datetime( - coerced, format=fmt, errors="coerce", - ) - except (ValueError, TypeError): - continue - - # Fallback: ``pd.to_datetime`` / dateutil. Handles shapes the - # sniffer missed (mixed formats within one column, - # already-parsed Timestamp/date objects sharing space with - # strings, ISO 8601 with offsets, etc.). Wrap in a warning - # filter because the unformatted path emits ``UserWarning: - # Could not infer format...`` once per chunk and we don't want - # the progress bar drowned. + # Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles + # shapes our explicit list missed (rare edge cases, mixed formats + # within one column). Same safety net as the numeric path. try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - with np.errstate(over="ignore", invalid="ignore", divide="ignore"): - return pd.to_datetime(coerced, errors="coerce") + with np.errstate(over="ignore", invalid="ignore", divide="ignore"): + return pd.to_datetime(coerced, errors="coerce") except (FloatingPointError, OverflowError, ValueError) as exc: tqdm.write( f"[error] {target_type} column {column_name!r}: "