Add null string sentinel handling in load_sas.py for improved missing value detection

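Introduced `NULL_STRING_SENTINELS`, a frozenset of lowercased string literals (`"null"`, `"na"`, `"n/a"`, `"#n/a"`, `"."`, `"none"`, `"nan"`) that are treated as SQL NULL during type inference, nullability detection, and COPY preparation. Added the helpers `_is_null_string` (sentinel check for a single value) and `_is_char_missing` (unified None / NaN / empty-or-sentinel check for CHAR/TEXT columns), plus a new `_null_sentinel_mask` function that replaces empty strings and sentinel values in object-dtype Series with None. The numeric, datetime, and TEXT coercion paths, `_all_null`, `_char_missing_mask`, and `_normalize_partition_value` now route through these helpers instead of only checking for the empty string, so sentinel text is handled consistently during data loading.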
This commit is contained in:
David Peterson 2026-04-22 19:20:07 -05:00
parent 998a3e282f
commit c3d1f72556

View File

@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
changes. Explicit ``chunksize=`` kwargs still win over both.""" changes. Explicit ``chunksize=`` kwargs still win over both."""
NULL_STRING_SENTINELS: frozenset = frozenset(
    (
        # Spelled-out missings.
        "null",
        "none",
        "nan",
        # Spreadsheet / R conventions.
        "na",
        "n/a",
        "#n/a",
        # SAS / Stata missing sentinel when exported as CHAR.
        ".",
    )
)
"""Lowercased string tokens that the loader treats as SQL ``NULL``.

Used by inference, nullability detection, and COPY preparation. These
show up when a source system writes missing values as literal text
(some SAS CHAR columns really do hold the word ``"null"``), exports the
SAS/Stata ``.`` missing marker as a string, or follows the spreadsheet
``NA`` / ``N/A`` habit.

The set is deliberately small:

* ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
* ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
* ``"."`` - SAS / Stata missing sentinel as CHAR export.

Entries must be lowercase: the helpers strip surrounding whitespace and
lowercase a value before looking it up here, so matching is
case-insensitive and whitespace-tolerant.

The helpers read this name at call time, so a calling module can swap
in a wider set (``import load_sas; load_sas.NULL_STRING_SENTINELS =
frozenset({...})``) when its source ships extra sentinels. Avoid
ambiguous tokens such as ``"0"`` or ``"unknown"`` - those are
legitimate data in plenty of schemas."""
def _is_null_string(value: Any) -> bool:
    """Report whether ``value`` is a NULL-sentinel string.

    The value is stripped of surrounding whitespace and lowercased, then
    looked up in :data:`NULL_STRING_SENTINELS`. Empty or whitespace-only
    strings are *not* sentinels here and yield ``False``.

    Accepts any Python object: anything that is not a ``str`` (floats,
    dates, ``None``, ...) returns ``False``, so the helper can sit in a
    row-walk over mixed-type cells without a type check at the call site.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        # Blank strings are rejected explicitly rather than relying on
        # the sentinel set not containing "".
        return token != "" and token in NULL_STRING_SENTINELS
    return False
def _is_char_missing(value: Any) -> bool:
    """Decide whether ``value`` counts as missing in a CHAR/TEXT column.

    A value is missing when it is any of:

    * ``None``;
    * a ``float`` for which ``pd.isna`` is true (NaN);
    * a string that is empty / whitespace-only, or whose stripped,
      lowercased form is in :data:`NULL_STRING_SENTINELS`.

    Every other value, including non-float numbers and dates, is
    treated as present. Keeping this three-way test in one helper means
    a change to the sentinel set reaches every caller.
    """
    if value is None:
        return True
    if isinstance(value, str):
        token = value.strip()
        return token == "" or token.lower() in NULL_STRING_SENTINELS
    # Only floats are probed for NaN; other types are never missing here.
    return isinstance(value, float) and bool(pd.isna(value))
def _null_sentinel_mask(series: pd.Series) -> pd.Series:
    """Replace missing-looking cells of an object-dtype series with ``None``.

    A cell is replaced when :func:`_is_char_missing` accepts it: ``None``,
    float NaN, an empty / whitespace-only string, or a string matching
    :data:`NULL_STRING_SENTINELS`. All other cells are carried over
    unchanged, so values that are already parsed (numbers, dates,
    Timestamps) are not touched.

    Object-dtype input yields a new Series built with ``Series.map``.
    Any other dtype is returned as-is - the same object, not a copy.

    The numeric and datetime coercion paths call this before parsing
    (where they previously rewrote only the empty string), so sentinel
    text such as ``"null"`` is already ``None`` by the time a parser
    sees the column.
    """

    def _blank_to_none(cell: Any) -> Any:
        # Collapse every flavour of "missing" to a single None marker.
        if _is_char_missing(cell):
            return None
        return cell

    if pd.api.types.is_object_dtype(series):
        return series.map(_blank_to_none)
    return series
VALID_IF_EXISTS = ("fail", "replace", "append") VALID_IF_EXISTS = ("fail", "replace", "append")
VALID_FILE_TYPES = ("sas", "text") VALID_FILE_TYPES = ("sas", "text")
@ -1181,12 +1258,12 @@ def union_column_types(
def _all_null(series: pd.Series) -> bool: def _all_null(series: pd.Series) -> bool:
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all()) return bool(series.map(_is_char_missing).all())
return bool(series.isna().all()) return bool(series.isna().all())
def _char_missing_mask(series: pd.Series) -> pd.Series: def _char_missing_mask(series: pd.Series) -> pd.Series:
return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == "")) return series.map(_is_char_missing)
def _is_nullable(series: pd.Series) -> bool: def _is_nullable(series: pd.Series) -> bool:
@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
except (TypeError, ValueError): except (TypeError, ValueError):
pass pass
# Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
# Python None up front so every type branch below can skip its own
# empty-string dance.
if _is_null_string(value):
return None
pg_upper = pg_type.upper() pg_upper = pg_type.upper()
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"): if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
differs from SAS/Oracle convention in corner cases; applying our differs from SAS/Oracle convention in corner cases; applying our
own parser keeps behavior predictable. own parser keeps behavior predictable.
""" """
coerced = series.replace({"": None}) coerced = _null_sentinel_mask(series)
numeric = pd.to_numeric(coerced, errors="coerce") numeric = pd.to_numeric(coerced, errors="coerce")
all_numeric = numeric.notna().sum() == coerced.notna().sum() all_numeric = numeric.notna().sum() == coerced.notna().sum()
if all_numeric and coerced.notna().any(): if all_numeric and coerced.notna().any():
@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
if pg in ("INTEGER", "BIGINT", "SMALLINT"): if pg in ("INTEGER", "BIGINT", "SMALLINT"):
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
series = pd.to_numeric( series = pd.to_numeric(
series.replace({"": None}), errors="coerce" _null_sentinel_mask(series), errors="coerce"
) )
out[name] = series.astype("Int64") out[name] = series.astype("Int64")
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"): elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
series = pd.to_numeric( series = pd.to_numeric(
series.replace({"": None}), errors="coerce" _null_sentinel_mask(series), errors="coerce"
) )
out[name] = series.astype("float64") out[name] = series.astype("float64")
elif pg == "DATE": elif pg == "DATE":
@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
# in the COPY statement turns the blanks back into SQL NULL. # in the COPY statement turns the blanks back into SQL NULL.
# astype(str) stringifies NaN/None to the literal "nan"/"None", # astype(str) stringifies NaN/None to the literal "nan"/"None",
# so we mask those after the fact rather than branching per cell. # so we mask those after the fact rather than branching per cell.
# Object columns also get the sentinel sweep
# (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
# ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
# matching what the numeric / date branches above do.
if pd.api.types.is_object_dtype(series):
series = _null_sentinel_mask(series)
na_mask = series.isna() na_mask = series.isna()
if pd.api.types.is_numeric_dtype(series): if pd.api.types.is_numeric_dtype(series):
# Hit when a column was auto-unioned to TEXT because at # Hit when a column was auto-unioned to TEXT because at