Add safe numeric-to-datetime conversion in load_sas.py to handle edge cases

Implement the _safe_numeric_to_datetime function, which converts numeric SAS-epoch series to datetime64[ns] while guarding against float overflow and non-finite values. Invalid entries are masked to NaN before conversion so a single bad cell cannot raise during data processing, and _prepare_for_copy now uses the helper for its SAS DATE and TIMESTAMP columns.
2026-04-21 15:55:25 -05:00 · 2026-04-21 15:55:25 -05:00 · c283b42876
commit c283b42876
parent a46f0518f6
1 changed files with 70 additions and 4 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@ -232,6 +232,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple

+import numpy as np
 import pandas as pd
 import psycopg2
 import psycopg2.extensions
@ -1944,6 +1945,71 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]:
    return dt.time(h, m, s)


+# Safe outer bound (in seconds) for the numeric->datetime conversion below.
+# Picked so that ``value * 1e9`` (the multiply pandas does internally to get
+# nanoseconds) stays well inside float64 range: ``1e13 * 1e9 = 1e22`` is a
+# comfortable ~286 orders of magnitude under the float64 ceiling, so the
+# multiply can't overflow. The bound is far wider than the datetime64[ns]
+# window (~1677-2262) -- 1e13 seconds is roughly 317,000 years -- so finite
+# values inside it but outside that window are left to ``errors="coerce"``.
+_SAS_DATETIME_SAFE_S = 1e13
+_SAS_DATETIME_SAFE_D = _SAS_DATETIME_SAFE_S / 86400.0
+
+
+def _safe_numeric_to_datetime(
+    series: pd.Series,
+    *,
+    unit: str,
+    column_name: str,
+    target_type: str,
+) -> pd.Series:
+    """Convert a numeric SAS-epoch series to ``datetime64[ns]`` without letting
+    one stray cell take down the worker.
+
+    Two failure modes we've hit in production:
+
+    * ``np.inf`` / ``-np.inf`` slipping through pyreadstat (SAS missing-value
+      sentinels, divide-by-zero in the source, uninitialized cells).
+    * Absurdly large finite floats (e.g. ``1.7e308``) where ``value * 1e9``
+      overflows float64.
+
+    Both cases trigger ``FloatingPointError: overflow encountered in multiply``
+    inside ``pd.to_datetime`` because pandas wraps the multiply in
+    ``np.errstate(over="raise")`` -- our outer ``errors="coerce"`` never gets
+    a chance to turn the bad value into ``NaT``.
+
+    Strategy: mask non-finite and out-of-range values to NaN *before* calling
+    ``to_datetime``, then run the conversion under a permissive ``errstate``
+    as a belt-and-suspenders. Emit one stderr line per chunk per affected
+    column so silent data loss doesn't sneak by.
+    """
+    finite_mask = np.isfinite(series.to_numpy(dtype="float64", na_value=np.nan))
+    if unit == "s":
+        bound = _SAS_DATETIME_SAFE_S
+    elif unit == "D":
+        bound = _SAS_DATETIME_SAFE_D
+    else:
+        bound = _SAS_DATETIME_SAFE_S
+    in_range_mask = series.abs() < bound
+    keep_mask = finite_mask & in_range_mask.fillna(False).to_numpy()
+    # Count cells we *would* drop that weren't already NaN, so we don't double-
+    # report rows that were missing in the source file.
+    was_present = ~series.isna().to_numpy()
+    coerced = int(((~keep_mask) & was_present).sum())
+    if coerced:
+        tqdm.write(
+            f"[warn] {target_type} column {column_name!r}: {coerced:,} "
+            f"row(s) had non-representable values (Inf/NaN/out-of-range), "
+            f"coerced to NULL",
+            file=sys.stderr,
+        )
+    cleaned = series.where(keep_mask, other=np.nan)
+    with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
+        return pd.to_datetime(
+            cleaned, unit=unit, origin="1960-01-01", errors="coerce",
+        )
+
+
 def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.DataFrame:
    """Materialize a copy of ``df`` with each column in the right shape for
    ``to_csv`` so the CSV lands as valid input for the target Postgres type.
@ -1992,8 +2058,8 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
                # 1960-01-01, the SAS epoch. Without this branch the raw
                # number would hit COPY and Postgres rejects it with
                # ``invalid input syntax for type date``.
-                parsed = pd.to_datetime(
-                    series, unit="D", origin="1960-01-01", errors="coerce",
+                parsed = _safe_numeric_to_datetime(
+                    series, unit="D", column_name=name, target_type="DATE",
                )
                out[name] = parsed.dt.date
            else:
@ -2010,8 +2076,8 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
                # *seconds* since 1960-01-01 (fractional seconds for
                # ``DATETIMEw.d``). Example caught in the wild:
                # ``1915465463.615`` -> 2020-09-13 05:44:23.615.
-                out[name] = pd.to_datetime(
-                    series, unit="s", origin="1960-01-01", errors="coerce",
+                out[name] = _safe_numeric_to_datetime(
+                    series, unit="s", column_name=name, target_type="TIMESTAMP",
                )
            else:
                out[name] = series