From c3d1f72556712d151149d09827b0c2179f94222c Mon Sep 17 00:00:00 2001
From: David Peterson <dpeterson@americafirstpolicy.com>
Date: Wed, 22 Apr 2026 19:20:07 -0500
Subject: [PATCH] Add null string sentinel handling in load_sas.py for improved
 missing value detection

Introduced a frozenset of lowercase string literals (NULL_STRING_SENTINELS) that are treated as SQL NULL during type inference and nullability detection. Added helper functions that identify sentinel strings (_is_null_string) and unify the missing-value check for CHAR/TEXT columns (_is_char_missing). Added _null_sentinel_mask, which replaces empty strings and sentinel values with None, and routed the numeric, datetime, and TEXT coercion paths through it so missing data is handled consistently across data types during loading.
---
 generic_loader/load_sas.py | 99 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 94 insertions(+), 5 deletions(-)

diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py
index 5d97c02..8ec0792 100644
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
 changes. Explicit ``chunksize=`` kwargs still win over both."""
 
 
+NULL_STRING_SENTINELS: frozenset = frozenset({
+    "null",
+    "na",
+    "n/a",
+    "#n/a",
+    ".",
+    "none",
+    "nan",
+})
+"""Lowercased string literals treated as SQL ``NULL`` across inference,
+nullability detection, and COPY preparation. Seen in the wild when a
+source system exports missing values as the literal text ``"null"``
+(yes, really; some SAS CHAR columns hold it verbatim) or uses the
+SAS/Stata ``.`` missing sentinel or spreadsheet-style ``NA`` / ``N/A``.
+
+Kept narrow on purpose:
+* ``"null"``, ``"none"``, ``"nan"`` — the common spelled-out missings.
+* ``"na"``, ``"n/a"``, ``"#n/a"`` — spreadsheet / R conventions.
+* ``"."`` — SAS / Stata missing sentinel as CHAR export.
+
+Matching strips whitespace and lowercases the value first, so every
+entry must itself be stripped and lowercase or it can never match. To
+extend, rebind ``load_sas.NULL_STRING_SENTINELS = frozenset({...})``;
+the helpers read the global at call time. Don't add ambiguous tokens
+(``"0"``, ``"unknown"``) - those are legitimate data in plenty of schemas."""
+
+
+def _is_null_string(value: Any) -> bool:
+    """Return True only when ``value`` is a ``str`` whose stripped,
+    lowercased form is in :data:`NULL_STRING_SENTINELS`. Non-strings
+    (floats, dates, None) and empty / whitespace-only strings return
+    False; use :func:`_is_char_missing` when those count as missing."""
+    if not isinstance(value, str):
+        return False
+    s = value.strip()
+    if not s:
+        return False
+    return s.lower() in NULL_STRING_SENTINELS
+
+
+def _is_char_missing(value: Any) -> bool:
+    """True if ``value`` is missing for a CHAR/TEXT column: ``None``, a
+    ``float`` NaN, an empty / whitespace-only string, or a string whose
+    stripped, lowercased form is in :data:`NULL_STRING_SENTINELS`.
+    Everything else (including ``pd.NA`` / ``pd.NaT``) returns False."""
+    if value is None:
+        return True
+    if isinstance(value, float) and pd.isna(value):
+        return True
+    if isinstance(value, str):
+        s = value.strip()
+        if not s:
+            return True
+        if s.lower() in NULL_STRING_SENTINELS:
+            return True
+    return False
+
+
+def _null_sentinel_mask(series: pd.Series) -> pd.Series:
+    """Return ``series`` with every cell that :func:`_is_char_missing`
+    flags replaced by ``None``.
+
+    Only object-dtype input is rewritten (via ``Series.map``, so a new
+    series comes back); any other dtype is returned as-is, not copied.
+    Cells replaced: ``None``, float NaN, empty / whitespace-only strings,
+    and sentinel strings. Parsed Timestamps / dates / numbers are kept.
+
+    Replaces the ``series.replace({"": None})`` calls in the numeric /
+    datetime coercion paths, which only rewrote the exact empty string.
+    NOTE(review): the original rationale said ``pd.to_datetime`` turned
+    ``"null"`` into today's date via dateutil - not verified here.
+    """
+    if not pd.api.types.is_object_dtype(series):
+        return series
+    return series.map(lambda v: None if _is_char_missing(v) else v)
+
+
 VALID_IF_EXISTS = ("fail", "replace", "append")
 
 VALID_FILE_TYPES = ("sas", "text")
@@ -1181,12 +1258,12 @@ def union_column_types(
 
 def _all_null(series: pd.Series) -> bool:
     if pd.api.types.is_object_dtype(series):
-        return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
+        return bool(series.map(_is_char_missing).all())
     return bool(series.isna().all())
 
 
 def _char_missing_mask(series: pd.Series) -> pd.Series:
-    return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == ""))
+    return series.map(_is_char_missing)
 
 
 def _is_nullable(series: pd.Series) -> bool:
@@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
     except (TypeError, ValueError):
         pass
 
+    # Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
+    # Python None before any type branch runs. Empty / whitespace-only
+    # strings are not sentinels here: ``_is_null_string`` returns False.
+    if _is_null_string(value):
+        return None
+
     pg_upper = pg_type.upper()
 
     if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
@@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
       differs from SAS/Oracle convention in corner cases; applying our
       own parser keeps behavior predictable.
     """
-    coerced = series.replace({"": None})
+    coerced = _null_sentinel_mask(series)
     numeric = pd.to_numeric(coerced, errors="coerce")
     all_numeric = numeric.notna().sum() == coerced.notna().sum()
     if all_numeric and coerced.notna().any():
@@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
         if pg in ("INTEGER", "BIGINT", "SMALLINT"):
             if pd.api.types.is_object_dtype(series):
                 series = pd.to_numeric(
-                    series.replace({"": None}), errors="coerce"
+                    _null_sentinel_mask(series), errors="coerce"
                 )
             out[name] = series.astype("Int64")
         elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
             if pd.api.types.is_object_dtype(series):
                 series = pd.to_numeric(
-                    series.replace({"": None}), errors="coerce"
+                    _null_sentinel_mask(series), errors="coerce"
                 )
             out[name] = series.astype("float64")
         elif pg == "DATE":
@@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
             # in the COPY statement turns the blanks back into SQL NULL.
             # astype(str) stringifies NaN/None to the literal "nan"/"None",
             # so we mask those after the fact rather than branching per cell.
+            # Object columns also get the sentinel sweep
+            # (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
+            # ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
+            # matching what the numeric / date branches above do.
+            if pd.api.types.is_object_dtype(series):
+                series = _null_sentinel_mask(series)
             na_mask = series.isna()
             if pd.api.types.is_numeric_dtype(series):
                 # Hit when a column was auto-unioned to TEXT because at