Add null string sentinel handling in load_sas.py for improved missing value detection

Introduced a frozenset of lowercased string literals (NULL_STRING_SENTINELS) that are treated as SQL NULL values, enhancing the inference and nullability detection processes. Implemented helper functions to identify null strings (_is_null_string) and unify missing value checks for CHAR/TEXT columns (_is_char_missing). Added the _null_sentinel_mask function, which replaces empty strings and these sentinel values with None in object-dtype columns, and routed the numeric, datetime, and TEXT coercion paths through it, ensuring consistent handling across various data types during data loading. This change improves robustness in managing missing data scenarios.
2026-04-22 19:20:07 -05:00 · 2026-04-22 19:20:07 -05:00 · c3d1f72556
commit c3d1f72556
parent 998a3e282f
1 changed files with 94 additions and 5 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
 changes. Explicit ``chunksize=`` kwargs still win over both."""
 NULL_STRING_SENTINELS: frozenset = frozenset((
    # Spelled-out missings.
    "null",
    "none",
    "nan",
    # Spreadsheet / R conventions.
    "na",
    "n/a",
    "#n/a",
    # SAS / Stata missing sentinel as CHAR export.
    ".",
 ))
 """String literals that the loader treats as SQL ``NULL`` during
 inference, nullability detection, and COPY preparation. They show up
 when a source system exports missing values as literal text such as
 ``"null"`` (some SAS CHAR columns hold it verbatim), as the SAS/Stata
 ``.`` missing sentinel, or as spreadsheet-style ``NA`` / ``N/A``.
 The set is intentionally narrow:
 * ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
 * ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
 * ``"."`` - SAS / Stata missing sentinel as CHAR export.
 Lookups strip and lowercase the candidate value before testing
 membership, so matching is case-insensitive and ignores leading /
 trailing whitespace. The entries themselves are NOT normalised: every
 member must already be lowercase with no surrounding whitespace or it
 can never match.
 To support additional sentinels, rebind the module attribute from a
 calling module (``import load_sas; load_sas.NULL_STRING_SENTINELS =
 frozenset({...})``). Don't add ambiguous tokens (``"0"``,
 ``"unknown"``) - those are legitimate data in plenty of schemas."""
 def _is_null_string(value: Any) -> bool:
    """Report whether ``value`` is a textual NULL sentinel.
    Only ``str`` inputs can match: the text is stripped of surrounding
    whitespace, lowercased, and looked up in
    :data:`NULL_STRING_SENTINELS`. Empty and whitespace-only strings
    return False, as does every non-string value (None, floats, dates,
    ...), so the helper can sit inside row-walks over mixed-type cells
    without extra guards."""
    if isinstance(value, str):
        token = value.strip().lower()
        # Blank text is deliberately not reported as a sentinel here;
        # callers that want "empty counts as missing" must check it
        # themselves.
        return bool(token) and token in NULL_STRING_SENTINELS
    return False
 def _is_char_missing(value: Any) -> bool:
    """Return True if ``value`` should be treated as missing for a
    CHAR/TEXT column.
    A value is missing when it is any of:
    * ``None``, ``pd.NA`` or ``pd.NaT``;
    * a float NaN;
    * a string that is empty or whitespace-only;
    * a string matching :data:`NULL_STRING_SENTINELS` (as decided by
      :func:`_is_null_string`).
    Every other value, including non-NaN floats and non-string objects,
    returns False. Sentinel matching is delegated to
    :func:`_is_null_string` so the normalisation rule lives in exactly
    one place."""
    # Identity checks: None plus the pandas missing singletons. pd.NA
    # and pd.NaT are not float instances, so the NaN test below would
    # not catch them when they sit in an object-dtype column.
    if value is None or value is pd.NA or value is pd.NaT:
        return True
    if isinstance(value, float):
        # Covers built-in float and its subclasses.
        return bool(pd.isna(value))
    if isinstance(value, str):
        # Blank text counts as missing here even though
        # _is_null_string deliberately rejects it.
        return not value.strip() or _is_null_string(value)
    return False
 def _null_sentinel_mask(series: pd.Series) -> pd.Series:
    """Replace CHAR-style missing markers in ``series`` with ``None``.
    For an object-dtype series a new series is built by mapping every
    cell through :func:`_is_char_missing`: cells it reports as missing
    become ``None``, all other cells (including already-parsed
    Timestamps / dates / numbers) are passed through unchanged. A
    series of any other dtype is returned as-is - the same object, not
    a copy.
    Background (per the original change note, not verifiable from this
    helper alone): the numeric / datetime / TEXT coercion paths used to
    rewrite only the empty string, so literal text such as ``"null"``
    reached ``pd.to_numeric`` / ``pd.to_datetime`` and relied on
    ``errors="coerce"`` to be discarded, which was reported as
    unreliable for dates. Routing through this helper removes the
    sentinels before any parser sees them.
    """
    def _blank_out(cell: Any) -> Any:
        # Keep the cell unless the shared CHAR-missing rule flags it.
        if _is_char_missing(cell):
            return None
        return cell
    if pd.api.types.is_object_dtype(series):
        return series.map(_blank_out)
    return series
 VALID_IF_EXISTS = ("fail", "replace", "append")
 VALID_FILE_TYPES = ("sas", "text")
@ -1181,12 +1258,12 @@ def union_column_types(
 def _all_null(series: pd.Series) -> bool:
    if pd.api.types.is_object_dtype(series):
-        return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
+        return bool(series.map(_is_char_missing).all())
    return bool(series.isna().all())
 def _char_missing_mask(series: pd.Series) -> pd.Series:
-    return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == ""))
+    return series.map(_is_char_missing)
 def _is_nullable(series: pd.Series) -> bool:
@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
    except (TypeError, ValueError):
        pass
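    # Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
    # Python None up front so the type branches below never see them.
    # NOTE: ``_is_null_string`` returns False for empty / whitespace-only
    # strings, so those are NOT collapsed here and each branch must
    # still handle them itself.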
    if _is_null_string(value):
        return None
    pg_upper = pg_type.upper()
    if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
      differs from SAS/Oracle convention in corner cases; applying our
      own parser keeps behavior predictable.
    """
-    coerced = series.replace({"": None})
+    coerced = _null_sentinel_mask(series)
    numeric = pd.to_numeric(coerced, errors="coerce")
    all_numeric = numeric.notna().sum() == coerced.notna().sum()
    if all_numeric and coerced.notna().any():
@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
        if pg in ("INTEGER", "BIGINT", "SMALLINT"):
            if pd.api.types.is_object_dtype(series):
                series = pd.to_numeric(
-                    series.replace({"": None}), errors="coerce"
+                    _null_sentinel_mask(series), errors="coerce"
                )
            out[name] = series.astype("Int64")
        elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
            if pd.api.types.is_object_dtype(series):
                series = pd.to_numeric(
-                    series.replace({"": None}), errors="coerce"
+                    _null_sentinel_mask(series), errors="coerce"
                )
            out[name] = series.astype("float64")
        elif pg == "DATE":
@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
            # in the COPY statement turns the blanks back into SQL NULL.
            # astype(str) stringifies NaN/None to the literal "nan"/"None",
            # so we mask those after the fact rather than branching per cell.
            # Object columns also get the sentinel sweep
            # (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
            # ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
            # matching what the numeric / date branches above do.
            if pd.api.types.is_object_dtype(series):
                series = _null_sentinel_mask(series)
            na_mask = series.isna()
            if pd.api.types.is_numeric_dtype(series):
                # Hit when a column was auto-unioned to TEXT because at