# Safe outer bound (in seconds) for the numeric->datetime conversion below.
# Its ONLY job is to keep ``value * 1e9`` (the multiply pandas does internally
# to get nanoseconds) inside float64 range: ``1e13 * 1e9 = 1e22`` is ~286
# orders of magnitude under the float64 ceiling, so the multiply can't
# overflow. It does NOT keep the result inside the datetime64[ns] window
# (~1677-2262): 1e13 seconds is about 317,000 years. Values that pass this
# bound but fall outside that window are turned into NaT by
# ``errors="coerce"`` and are counted in the warning emitted below.
_SAS_DATETIME_SAFE_S = 1e13
_SAS_DATETIME_SAFE_D = _SAS_DATETIME_SAFE_S / 86400.0
# The same physical bound expressed per ``pd.to_datetime`` unit, so that
# sub-second units are not compared against a bound that is in seconds.
_SAS_DATETIME_SAFE_BY_UNIT = {
    "D": _SAS_DATETIME_SAFE_D,
    "s": _SAS_DATETIME_SAFE_S,
    "ms": _SAS_DATETIME_SAFE_S * 1e3,
    "us": _SAS_DATETIME_SAFE_S * 1e6,
    "ns": _SAS_DATETIME_SAFE_S * 1e9,
}


def _safe_numeric_to_datetime(
    series: pd.Series,
    *,
    unit: str,
    column_name: str,
    target_type: str,
) -> pd.Series:
    """Convert a numeric SAS-epoch series to ``datetime64[ns]`` without letting
    one stray cell take down the worker.

    Two failure modes we've hit in production:

    * ``np.inf`` / ``-np.inf`` slipping through pyreadstat (SAS missing-value
      sentinels, divide-by-zero in the source, uninitialized cells).
    * Absurdly large finite floats (e.g. ``1.7e308``) where ``value * 1e9``
      overflows float64.

    Both cases trigger ``FloatingPointError: overflow encountered in multiply``
    inside ``pd.to_datetime`` because pandas wraps the multiply in
    ``np.errstate(over="raise")`` -- our outer ``errors="coerce"`` never gets
    a chance to turn the bad value into ``NaT``.

    Strategy: mask non-finite and out-of-range values to NaN *before* calling
    ``to_datetime``; that masking is the real defence. The surrounding
    permissive ``errstate`` is kept only as a best-effort extra -- an inner
    ``errstate(over="raise")`` takes precedence over it. Emit one stderr line
    per call when any cell that was present in the input ends up NULL, so
    silent data loss doesn't sneak by.

    Args:
        series: Numeric values counted from the SAS epoch (1960-01-01).
        unit: ``pd.to_datetime`` unit of ``series``. ``"D"``, ``"s"``,
            ``"ms"``, ``"us"`` and ``"ns"`` get a correctly scaled bound;
            any other unit falls back to the seconds bound.
        column_name: Column name, used only in the warning text.
        target_type: Target SQL type label, used only in the warning text.

    Returns:
        A ``datetime64[ns]`` series aligned with ``series``; cells that were
        missing, non-finite, or not representable are ``NaT``.
    """
    finite_mask = np.isfinite(series.to_numpy(dtype="float64", na_value=np.nan))
    # Unknown units keep the historical behaviour (seconds bound).
    bound = _SAS_DATETIME_SAFE_BY_UNIT.get(unit, _SAS_DATETIME_SAFE_S)
    in_range_mask = series.abs() < bound
    keep_mask = finite_mask & in_range_mask.fillna(False).to_numpy()
    # Rows that were already missing in the source file must not be reported
    # as coerced, so remember which cells actually carried a value.
    was_present = ~series.isna().to_numpy()
    cleaned = series.where(keep_mask, other=np.nan)
    with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
        parsed = pd.to_datetime(
            cleaned, unit=unit, origin="1960-01-01", errors="coerce",
        )
    # Count AFTER the conversion: this covers both the cells masked above and
    # the cells that passed the overflow bound but still fell outside the
    # datetime64[ns] window and were turned into NaT by errors="coerce".
    coerced = int((parsed.isna().to_numpy() & was_present).sum())
    if coerced:
        tqdm.write(
            f"[warn] {target_type} column {column_name!r}: {coerced:,} "
            f"row(s) had non-representable values (Inf/NaN/out-of-range), "
            f"coerced to NULL",
            file=sys.stderr,
        )
    return parsed
- parsed = pd.to_datetime( - series, unit="D", origin="1960-01-01", errors="coerce", + parsed = _safe_numeric_to_datetime( + series, unit="D", column_name=name, target_type="DATE", ) out[name] = parsed.dt.date else: @@ -2010,8 +2076,8 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da # *seconds* since 1960-01-01 (fractional seconds for # ``DATETIMEw.d``). Example caught in the wild: # ``1915465463.615`` -> 2020-09-13 05:44:23.615. - out[name] = pd.to_datetime( - series, unit="s", origin="1960-01-01", errors="coerce", + out[name] = _safe_numeric_to_datetime( + series, unit="s", column_name=name, target_type="TIMESTAMP", ) else: out[name] = series