Optimize datetime parsing in load_sas.py by implementing a sample-based format detection approach

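Sample up to `_DATETIME_FORMAT_SAMPLE` (16) non-null values per chunk to decide which datetime parsing strategy to use for the whole chunk, significantly reducing processing time for large datasets. This replaces the previous approach, which walked every row and then tried each explicit format against the full column, with a bounded format sniff followed by a single parse pass, while still handling the same range of date formats (the DD-MON-YY regex parser, the explicit strptime formats, and the `pd.to_datetime`/dateutil fallback). The fallback path now also suppresses the per-chunk `Could not infer format` UserWarning. Comments are updated to describe the new approach.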
2026-04-22 12:54:19 -05:00 · 2026-04-22 12:54:19 -05:00 · 857f696305
commit 857f696305
parent c3fa943e77
1 changed files with 70 additions and 32 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -2471,6 +2471,12 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]:
 _SAS_DATETIME_SAFE_S = 7_500_000_000
 _SAS_DATETIME_SAFE_D = 87_000
 # Number of non-null values :func:`_safe_object_to_datetime` peeks at to
 # decide which parse path to use for the whole chunk. Keeps format
 # detection to a bounded cost so a 1M-row chunk doesn't pay for a
 # full row-walk just to figure out what shape its dates are in.
 _DATETIME_FORMAT_SAMPLE = 16
 def _safe_numeric_to_datetime(
    series: pd.Series,
@@ -2590,42 +2596,74 @@ def _safe_object_to_datetime(
            numeric, unit="s", column_name=column_name, target_type=target_type,
        )
-    non_null_count = int(coerced.notna().sum())
+    # Format sniff: peek at up to ``_DATETIME_FORMAT_SAMPLE`` non-null
-    if non_null_count:
+    # string values to decide the parse path for the WHOLE chunk. The
-        # First pass: our regex-based ``DD-MON-YY`` parser. Cheap,
+    # previous version ran a full row-walk + up-to-20 vectorized
-        # locale-independent, covers the cases ``pd.to_datetime`` warns
+    # ``pd.to_datetime(format=fmt)`` attempts per column per chunk; a
-        # about. Always parse via the datetime-aware variant so a DATE
+    # single fat chunk (millions of rows × a few date columns) could
-        # target whose chunk happens to carry time components
+    # pin a CPU for minutes. Sniffing first keeps the hot path to one
-        # (``23-MAR-20 14:30:00``) still parses without warnings; the
+    # O(n) pass.
-        # caller's ``.dt.date`` cast truncates the time, matching the
+    non_null = coerced.dropna()
-        # existing datetime64-input branch.
+    if not non_null.empty:
-        parsed_py = coerced.map(
+        samples: List[str] = []
-            lambda v: _parse_flexible_datetime(v) if v is not None else None
+        for v in non_null.head(_DATETIME_FORMAT_SAMPLE):
-        )
+            if isinstance(v, str):
-        parsed_ts = pd.to_datetime(parsed_py, errors="coerce")
+                samples.append(v.strip())
-        if int(parsed_ts.notna().sum()) == non_null_count:
+            else:
-            return parsed_ts
+                # Mixed object column (e.g. already-parsed Timestamps +
                # strings). Skip sniffing; let dateutil handle it.
                samples = []
                break
-        # Second pass: vectorized ``pd.to_datetime`` with each explicit
+        if samples:
-        # format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C;
+            # DD-MON-YY family: one pandas ``Series.map`` with our
-        # trying a handful of them still beats row-by-row dateutil on
+            # regex parser, then a single ``pd.to_datetime`` to land
-        # large chunks. Accept the first format that covers every
+            # on ``datetime64[ns]``. ``pd.to_datetime(format=...)``
-        # non-null cell.
+            # has no ``%b``-with-locale-free semantics, so this is
-        for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
+            # the vectorized win available for this format family.
-            try:
+            if all(_DDMONYY_RE.match(s) for s in samples):
                with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
-                    candidate = pd.to_datetime(coerced, format=fmt, errors="coerce")
+                    return pd.to_datetime(
-            except (ValueError, TypeError):
+                        coerced.map(
-                continue
+                            lambda v: _parse_flexible_datetime(v)
-            if int(candidate.notna().sum()) == non_null_count:
+                            if isinstance(v, str) else None
-                return candidate
+                        ),
                        errors="coerce",
                    )
-    # Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles
+            # Numeric strptime shapes: pick the first format that
-    # shapes our explicit list missed (rare edge cases, mixed formats
+            # parses every sample, then run ONE vectorized
-    # within one column). Same safety net as the numeric path.
+            # ``pd.to_datetime(format=fmt)`` over the full column.
            # Bounded to the sample pass -- no 20×O(n) blow-up.
            for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
                ok = True
                for s in samples:
                    try:
                        dt.datetime.strptime(s, fmt)
                    except ValueError:
                        ok = False
                        break
                if ok:
                    try:
                        with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
                            return pd.to_datetime(
                                coerced, format=fmt, errors="coerce",
                            )
                    except (ValueError, TypeError):
                        continue
    # Fallback: ``pd.to_datetime`` / dateutil. Handles shapes the
    # sniffer missed (mixed formats within one column,
    # already-parsed Timestamp/date objects sharing space with
    # strings, ISO 8601 with offsets, etc.). Wrap in a warning
    # filter because the unformatted path emits ``UserWarning:
    # Could not infer format...`` once per chunk and we don't want
    # the progress bar drowned.
    try:
-        with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
+        with warnings.catch_warnings():
-            return pd.to_datetime(coerced, errors="coerce")
+            warnings.simplefilter("ignore", category=UserWarning)
            with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
                return pd.to_datetime(coerced, errors="coerce")
    except (FloatingPointError, OverflowError, ValueError) as exc:
        tqdm.write(
            f"[error] {target_type} column {column_name!r}: "