From c3d1f72556712d151149d09827b0c2179f94222c Mon Sep 17 00:00:00 2001 From: David Peterson Date: Wed, 22 Apr 2026 19:20:07 -0500 Subject: [PATCH] Add null string sentinel handling in load_sas.py for improved missing value detection Introduced a frozenset of string literals that represent SQL NULL values, enhancing the inference and nullability detection processes. Implemented helper functions to identify null strings and unify missing value checks for CHAR/TEXT columns. Updated the _null_sentinel_mask function to replace these sentinel values with None, ensuring consistent handling across various data types during data loading. This change improves robustness in managing missing data scenarios. --- generic_loader/load_sas.py | 99 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 5d97c02..8ec0792 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the changes. Explicit ``chunksize=`` kwargs still win over both.""" +NULL_STRING_SENTINELS: frozenset = frozenset({ + "null", + "na", + "n/a", + "#n/a", + ".", + "none", + "nan", +}) +"""Lowercased string literals treated as SQL ``NULL`` across inference, +nullability detection, and COPY preparation. Seen in the wild when a +source system exports missing values as the literal text ``"null"`` +(yes, really; some SAS CHAR columns hold it verbatim) or uses the +SAS/Stata ``.`` missing sentinel or spreadsheet-style ``NA`` / ``N/A``. + +Kept narrow on purpose: +* ``"null"``, ``"none"``, ``"nan"`` — the common spelled-out missings. +* ``"na"``, ``"n/a"``, ``"#n/a"`` — spreadsheet / R conventions. +* ``"."`` — SAS / Stata missing sentinel as CHAR export. + +Matching is case-insensitive and ignores leading / trailing whitespace. +Extend this set in a calling module (``import load_sas; +load_sas.NULL_STRING_SENTINELS = frozenset({...})``) if your source +ships additional sentinels. Don't add ambiguous tokens (``"0"``, +``"unknown"``) - those are legitimate data in plenty of schemas.""" + + +def _is_null_string(value: Any) -> bool: + """True if ``value`` is a string whose lowercased/stripped form is + in :data:`NULL_STRING_SENTINELS`. Safe to call on any Python value; + non-strings return False so the helper can be dropped into the same + row-walks that also see floats / dates / None.""" + if not isinstance(value, str): + return False + s = value.strip() + if not s: + return False + return s.lower() in NULL_STRING_SENTINELS + + +def _is_char_missing(value: Any) -> bool: + """True if ``value`` should be treated as missing for a CHAR/TEXT + column. Unifies the three-way check (None / NaN / empty-or-sentinel + string) that used to live inline in several helpers so extending + the sentinel set in one place propagates everywhere.""" + if value is None: + return True + if isinstance(value, float) and pd.isna(value): + return True + if isinstance(value, str): + s = value.strip() + if not s: + return True + if s.lower() in NULL_STRING_SENTINELS: + return True + return False + + +def _null_sentinel_mask(series: pd.Series) -> pd.Series: + """Return a copy of ``series`` with empty strings and any value in + :data:`NULL_STRING_SENTINELS` replaced by ``None``. + + Previously the coercion paths (numeric / datetime / TEXT) only + rewrote the empty string. That meant the literal text ``"null"`` + sailed through ``pd.to_numeric(errors="coerce")`` as ``NaN`` (fine + for numerics; by accident) but ``pd.to_datetime(errors="coerce")`` + handed it to ``dateutil`` which happily parsed it as... today's + date (dateutil treats bare words as "use current date for missing + fields"). Routing through this helper fixes both problems in one + pass. Non-string values are left alone so already-parsed + Timestamps / dates / numbers pass through untouched. + """ + if not pd.api.types.is_object_dtype(series): + return series + return series.map(lambda v: None if _is_char_missing(v) else v) + + VALID_IF_EXISTS = ("fail", "replace", "append") VALID_FILE_TYPES = ("sas", "text") @@ -1181,12 +1258,12 @@ def union_column_types( def _all_null(series: pd.Series) -> bool: if pd.api.types.is_object_dtype(series): - return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all()) + return bool(series.map(_is_char_missing).all()) return bool(series.isna().all()) def _char_missing_mask(series: pd.Series) -> pd.Series: - return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == "")) + return series.map(_is_char_missing) def _is_nullable(series: pd.Series) -> bool: @@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any: except (TypeError, ValueError): pass + # Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to + # Python None up front so every type branch below can skip its own + # empty-string dance. + if _is_null_string(value): + return None + pg_upper = pg_type.upper() if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"): @@ -2582,7 +2665,7 @@ def _safe_object_to_datetime( differs from SAS/Oracle convention in corner cases; applying our own parser keeps behavior predictable. """ - coerced = series.replace({"": None}) + coerced = _null_sentinel_mask(series) numeric = pd.to_numeric(coerced, errors="coerce") all_numeric = numeric.notna().sum() == coerced.notna().sum() if all_numeric and coerced.notna().any(): @@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da if pg in ("INTEGER", "BIGINT", "SMALLINT"): if pd.api.types.is_object_dtype(series): series = pd.to_numeric( - series.replace({"": None}), errors="coerce" + _null_sentinel_mask(series), errors="coerce" ) out[name] = series.astype("Int64") elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"): if pd.api.types.is_object_dtype(series): series = pd.to_numeric( - series.replace({"": None}), errors="coerce" + _null_sentinel_mask(series), errors="coerce" ) out[name] = series.astype("float64") elif pg == "DATE": @@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da # in the COPY statement turns the blanks back into SQL NULL. # astype(str) stringifies NaN/None to the literal "nan"/"None", # so we mask those after the fact rather than branching per cell. + # Object columns also get the sentinel sweep + # (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` / + # ``"NA"`` / ``"."`` value lands as SQL NULL on the way in, + # matching what the numeric / date branches above do. + if pd.api.types.is_object_dtype(series): + series = _null_sentinel_mask(series) na_mask = series.isna() if pd.api.types.is_numeric_dtype(series): # Hit when a column was auto-unioned to TEXT because at