Enhance error reporting in load_folder.py and load_sas.py for better debugging

Updated error handling in the _worker_load_append_file function to include full tracebacks in exception messages, improving context for failures during file loading. Additionally, modified the _safe_numeric_to_datetime function to provide detailed warnings when conversion errors occur, ensuring users are informed of potential data issues. These changes aim to facilitate easier debugging and enhance the robustness of the data loading process.
This commit is contained in:
David Peterson 2026-04-21 16:56:27 -05:00
parent eff82c73ce
commit 64e7ff0b0a
2 changed files with 133 additions and 33 deletions

View File

@ -1148,7 +1148,15 @@ def _worker_load_append_file(
finally: finally:
conn.close() conn.close()
except Exception as e: except Exception as e:
return (path_str, 0, f"{type(e).__name__}: {e}") import traceback as _traceback
tb = _traceback.format_exc()
# Keep the one-line summary (what the tqdm [FAIL] print uses) but
# tack on the full traceback so the final cluster-failure block
# shows the file/line that crashed. Without this, ``ProcessPool``
# workers lose every frame of context - you get "FloatingPointError:
# overflow encountered in multiply" with no hint of where inside
# the pandas/numpy/pyarrow stack it happened.
return (path_str, 0, f"{type(e).__name__}: {e}\n{tb}")
finally: finally:
# Hand memory back to the OS before the worker is recycled (or before # Hand memory back to the OS before the worker is recycled (or before
# ``max_tasks_per_child`` rotates this process). Three layers, each # ``max_tasks_per_child`` rotates this process). Three layers, each

View File

@ -254,6 +254,22 @@ from tqdm import tqdm
# at import time - narrow category match so nothing else is suppressed. # at import time - narrow category match so nothing else is suppressed.
warnings.filterwarnings("ignore", category=PerformanceWarning) warnings.filterwarnings("ignore", category=PerformanceWarning)
# Turn numpy's "raise on float overflow" (and friends) into silent inf/nan
# production, module-wide. Pandas ships with ``np.errstate(over="raise")``
# wrapped around several internal ops (most painfully, the multiply inside
# ``pd.to_datetime(unit="s")`` that converts SAS epoch -> nanoseconds).
# Our data routinely carries ``inf`` / huge sentinels, which trip that
# ``raise`` and blow up an entire worker before ``errors="coerce"`` gets
# a chance to turn them into NaT. Even with ``_safe_numeric_to_datetime``
# pre-masking the obvious cases, other code paths (pandas object-dtype
# datetime parsing, pyarrow type promotion, pyreadstat) can also trigger.
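# Setting a process-wide ``seterr`` is a heavier hammer than an
# ``errstate`` block: it changes the default for every numpy op in this
# process that does not set its own error state. It does NOT override an
# explicit ``np.errstate(over="raise")`` that a library opens around its
# own call, so the pre-masking and try/except guards in the datetime
# helpers are still required. Downside: a real overflow bug in new code
# would now silently produce inf/nan instead of raising - acceptable for a
# bulk loader where "don't crash on bad rows, null them and move on" is
# the whole point.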
np.seterr(over="ignore", invalid="ignore", divide="ignore")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -1976,15 +1992,16 @@ def _seconds_to_time(v: Any) -> Optional[dt.time]:
return dt.time(h, m, s) return dt.time(h, m, s)
# Safe outer bound (in seconds) for the numeric->datetime conversion below. # Safe outer bound for the numeric->datetime conversion below. The true
# Picked so that ``value * 1e9`` (the multiply pandas does internally to get # ceiling is ``pd.Timestamp.max`` (2262-04-11), which in seconds since 1960
# nanoseconds) stays well inside float64 range *and* the resulting timestamp # is ~9.52e9. We pick a much tighter bound - year ~2200, ~7.6e9 seconds,
# stays inside the datetime64[ns] window (~1677-2262). 1e13 seconds is roughly # ~87600 days - because (a) any real SAS data past ~2100 is garbage anyway,
# year 318888 -- absurdly far past anything a real SAS file would carry, but # and (b) staying well inside the float64 + datetime64[ns] windows gives
# small enough that ``1e13 * 1e9 = 1e22`` is a comfortable ~286 orders of # pandas' internals zero room to trip the ``over="raise"`` they wrap
# magnitude under the float64 ceiling, so the multiply can't overflow. # around the ns-multiply. ``7.5e9 * 1e9 = 7.5e18``, comfortably under both
_SAS_DATETIME_SAFE_S = 1e13 # ``int64.max`` (~9.22e18) and float64 overflow (~1.8e308).
_SAS_DATETIME_SAFE_D = _SAS_DATETIME_SAFE_S / 86400.0 _SAS_DATETIME_SAFE_S = 7_500_000_000
_SAS_DATETIME_SAFE_D = 87_000
def _safe_numeric_to_datetime( def _safe_numeric_to_datetime(
@ -1997,35 +2014,57 @@ def _safe_numeric_to_datetime(
"""Convert a numeric SAS-epoch series to ``datetime64[ns]`` without letting """Convert a numeric SAS-epoch series to ``datetime64[ns]`` without letting
one stray cell take down the worker. one stray cell take down the worker.
Two failure modes we've hit in production: Failure modes seen in production:
* ``np.inf`` / ``-np.inf`` slipping through pyreadstat (SAS missing-value * ``np.inf`` / ``-np.inf`` slipping through pyreadstat (SAS missing-value
sentinels, divide-by-zero in the source, uninitialized cells). sentinels, divide-by-zero in the source, uninitialized cells).
* Absurdly large finite floats (e.g. ``1.7e308``) where ``value * 1e9`` * Absurdly large finite floats (e.g. ``1.7e308``) where ``value * 1e9``
overflows float64. overflows float64.
* Values between ``pd.Timestamp.max`` and float64 safety (~9.5e9 to 1e308
seconds) where the nanosecond multiply silently produces garbage or
overflows int64.
Both cases trigger ``FloatingPointError: overflow encountered in multiply`` All of these trigger ``FloatingPointError: overflow encountered in multiply``
inside ``pd.to_datetime`` because pandas wraps the multiply in inside ``pd.to_datetime`` because pandas wraps the multiply in
``np.errstate(over="raise")`` -- our outer ``errors="coerce"`` never gets ``np.errstate(over="raise")`` -- our outer ``errors="coerce"`` never
a chance to turn the bad value into ``NaT``. gets a chance to turn the bad value into ``NaT``.
Strategy: mask non-finite and out-of-range values to NaN *before* calling Strategy, belt + suspenders + airbag:
``to_datetime``, then run the conversion under a permissive ``errstate``
as a belt-and-suspenders. Emit one stderr line per chunk per affected 1. Coerce to float64 up front. Object-dtype branches hand us mixed
column so silent data loss doesn't sneak by. int/float/str; ``pd.to_numeric(errors="coerce")`` parses what it can
and NaNs the rest, so we hit the rest of this function with a
pristine float series.
2. Mask non-finite values and anything outside the safe epoch window to
NaN *before* ``pd.to_datetime`` sees them.
3. Run the conversion under a permissive ``errstate``.
4. If that still raises (some pandas version internally re-enables
``over="raise"`` in a way ``errstate`` can't override), catch it
and return all-NaT for the column with a loud warning. Better a
NULL column in one chunk than a dead worker + no diagnostics.
Emits one stderr line per chunk per affected column so silent data
loss doesn't sneak by.
""" """
finite_mask = np.isfinite(series.to_numpy(dtype="float64", na_value=np.nan)) if not pd.api.types.is_float_dtype(series):
series = pd.to_numeric(series, errors="coerce").astype("float64")
arr = series.to_numpy(dtype="float64", copy=False, na_value=np.nan)
if unit == "s": if unit == "s":
bound = _SAS_DATETIME_SAFE_S bound = _SAS_DATETIME_SAFE_S
elif unit == "D": elif unit == "D":
bound = _SAS_DATETIME_SAFE_D bound = _SAS_DATETIME_SAFE_D
else: else:
bound = _SAS_DATETIME_SAFE_S bound = _SAS_DATETIME_SAFE_S
in_range_mask = series.abs() < bound with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
keep_mask = finite_mask & in_range_mask.fillna(False).to_numpy() finite_mask = np.isfinite(arr)
# Count cells we *would* drop that weren't already NaN, so we don't double- # ``np.abs(inf) -> inf``, ``np.abs(nan) -> nan``; both compare False
# report rows that were missing in the source file. # to ``bound``, so ``in_range_mask`` already excludes non-finite
was_present = ~series.isna().to_numpy() # values. The explicit ``finite_mask &`` below is belt-and-suspenders
# in case a future numpy changes that semantic.
in_range_mask = np.abs(arr) < bound
keep_mask = finite_mask & in_range_mask
was_present = ~np.isnan(arr)
coerced = int(((~keep_mask) & was_present).sum()) coerced = int(((~keep_mask) & was_present).sum())
if coerced: if coerced:
tqdm.write( tqdm.write(
@ -2034,11 +2073,56 @@ def _safe_numeric_to_datetime(
f"coerced to NULL", f"coerced to NULL",
file=sys.stderr, file=sys.stderr,
) )
cleaned = series.where(keep_mask, other=np.nan) cleaned_arr = np.where(keep_mask, arr, np.nan)
cleaned = pd.Series(cleaned_arr, index=series.index)
try:
with np.errstate(over="ignore", invalid="ignore", divide="ignore"): with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
return pd.to_datetime( return pd.to_datetime(
cleaned, unit=unit, origin="1960-01-01", errors="coerce", cleaned, unit=unit, origin="1960-01-01", errors="coerce",
) )
except (FloatingPointError, OverflowError, ValueError) as exc:
tqdm.write(
f"[error] {target_type} column {column_name!r}: "
f"pd.to_datetime raised {type(exc).__name__}: {exc}; "
f"returning NaT for the entire chunk. This usually means one "
f"or more values slipped past the pre-mask (bound={bound}). "
f"Consider setting the column to TEXT via column_types if this "
f"recurs.",
file=sys.stderr,
)
return pd.Series(pd.NaT, index=series.index, dtype="datetime64[ns]")
def _safe_object_to_datetime(
    series: pd.Series,
    *,
    column_name: str,
    target_type: str,
) -> pd.Series:
    """Convert an object-dtype column to ``datetime64[ns]`` without raising.

    Empty strings are treated as missing. If every remaining non-null cell
    parses as a number (e.g. SAS wrote numerics into an object column), the
    column is routed through :func:`_safe_numeric_to_datetime` so it gets the
    same range masking and overflow protection as a native numeric column.
    Otherwise the values are parsed directly with ``pd.to_datetime`` under a
    permissive ``np.errstate``.

    Args:
        series: Object-dtype column to convert.
        column_name: Column name, used only in diagnostic messages.
        target_type: Destination type label (callers pass ``"DATE"`` or
            ``"TIMESTAMP"``). Also selects the epoch unit for the numeric
            route: ``"DATE"`` -> days, anything else -> seconds.

    Returns:
        A datetime series aligned to ``series.index``. Unparseable cells
        become ``NaT``; if ``pd.to_datetime`` itself raises, the whole chunk
        is returned as ``NaT`` and an ``[error]`` line is written to stderr.
    """
    blanks_as_null = series.replace({"": None})
    numeric = pd.to_numeric(blanks_as_null, errors="coerce")
    present = int(blanks_as_null.notna().sum())

    # "All numeric" means: at least one value is present and coercing to
    # numeric lost none of them.
    if present and int(numeric.notna().sum()) == present:
        # The numeric helper interprets values relative to 1960-01-01 in the
        # given unit. DATE columns carry day counts, datetime columns carry
        # second counts; feeding day counts in as seconds would collapse
        # every date onto (or next to) 1960-01-01.
        unit = "D" if target_type == "DATE" else "s"
        return _safe_numeric_to_datetime(
            numeric,
            unit=unit,
            column_name=column_name,
            target_type=target_type,
        )

    try:
        with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
            return pd.to_datetime(blanks_as_null, errors="coerce")
    except (FloatingPointError, OverflowError, ValueError) as exc:
        # Prefer a NULL column for this chunk over a dead worker.
        tqdm.write(
            f"[error] {target_type} column {column_name!r}: "
            f"pd.to_datetime raised {type(exc).__name__}: {exc}; "
            f"returning NaT for the entire chunk.",
            file=sys.stderr,
        )
        return pd.Series(pd.NaT, index=series.index, dtype="datetime64[ns]")
def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.DataFrame: def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.DataFrame:
@ -2077,9 +2161,14 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
elif pd.api.types.is_object_dtype(series): elif pd.api.types.is_object_dtype(series):
# Vectorized parse: empty strings / None / unparseable -> NaT, # Vectorized parse: empty strings / None / unparseable -> NaT,
# then .dt.date yields date objects or NaT. NaT serializes as # then .dt.date yields date objects or NaT. NaT serializes as
# an empty CSV field (matching ``NULL ''`` in COPY). # an empty CSV field (matching ``NULL ''`` in COPY). Routed
parsed = pd.to_datetime( # through ``_safe_object_to_datetime`` so an object column
series.replace({"": None}), errors="coerce" # that actually contains SAS-epoch numerics (seen when one
# file of a cluster stores the column as NUM and another as
# CHAR + the union flipped it to TEXT-then-DATE) can't trip
# the overflow-in-multiply bug.
parsed = _safe_object_to_datetime(
series, column_name=name, target_type="DATE",
) )
out[name] = parsed.dt.date out[name] = parsed.dt.date
elif pd.api.types.is_numeric_dtype(series): elif pd.api.types.is_numeric_dtype(series):
@ -2099,8 +2188,11 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
if pd.api.types.is_datetime64_any_dtype(series): if pd.api.types.is_datetime64_any_dtype(series):
out[name] = series out[name] = series
elif pd.api.types.is_object_dtype(series): elif pd.api.types.is_object_dtype(series):
out[name] = pd.to_datetime( # Same rationale as the DATE object branch above: route
series.replace({"": None}), errors="coerce" # through the safety net so numeric-flavored object columns
# can't blow us up during the ns multiply.
out[name] = _safe_object_to_datetime(
series, column_name=name, target_type="TIMESTAMP",
) )
elif pd.api.types.is_numeric_dtype(series): elif pd.api.types.is_numeric_dtype(series):
# Same story as the DATE branch above, but SAS datetimes are # Same story as the DATE branch above, but SAS datetimes are