Add null string sentinel handling in load_sas.py for improved missing value detection
Introduced NULL_STRING_SENTINELS, a frozenset of lowercased string literals ("null", "na", "n/a", "#n/a", ".", "none", "nan") that are treated as SQL NULL during type inference and nullability detection. Added the helpers _is_null_string and _is_char_missing to unify missing-value checks for CHAR/TEXT columns, and added _null_sentinel_mask, which replaces empty strings and sentinel values with None before numeric, datetime, and TEXT coercion. Together these make missing-value handling consistent across data types during data loading.
This commit is contained in:
parent
998a3e282f
commit
c3d1f72556
@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
|
|||||||
changes. Explicit ``chunksize=`` kwargs still win over both."""
|
changes. Explicit ``chunksize=`` kwargs still win over both."""
|
||||||
|
|
||||||
|
|
||||||
|
# Entries are stored lowercased: the lookup helpers strip and lowercase a
# candidate string before testing membership.
NULL_STRING_SENTINELS: frozenset = frozenset(
    (
        # Spelled-out missing markers.
        "null",
        "none",
        "nan",
        # Spreadsheet / R conventions.
        "na",
        "n/a",
        "#n/a",
        # SAS / Stata missing sentinel as it appears in a CHAR export.
        ".",
    )
)
"""String literals (lowercased) that are treated as SQL ``NULL`` during
inference, nullability detection, and COPY preparation.

These show up when a source system writes its missing values out as
text: the literal word ``"null"`` (some SAS CHAR columns hold it
verbatim), the SAS/Stata ``.`` missing marker, or the spreadsheet-style
``NA`` / ``N/A``.

The set is deliberately small:

* ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
* ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
* ``"."`` - SAS / Stata missing sentinel as CHAR export.

Matching is case-insensitive and ignores leading / trailing whitespace.
To recognise additional sentinels, rebind the name from the calling
module (``import load_sas; load_sas.NULL_STRING_SENTINELS =
frozenset({...})``). Keep ambiguous tokens such as ``"0"`` or
``"unknown"`` out of the set - they are legitimate data in plenty of
schemas."""
|
||||||
|
|
||||||
|
|
||||||
|
def _is_null_string(value: Any) -> bool:
|
||||||
|
"""True if ``value`` is a string whose lowercased/stripped form is
|
||||||
|
in :data:`NULL_STRING_SENTINELS`. Safe to call on any Python value;
|
||||||
|
non-strings return False so the helper can be dropped into the same
|
||||||
|
row-walks that also see floats / dates / None."""
|
||||||
|
if not isinstance(value, str):
|
||||||
|
return False
|
||||||
|
s = value.strip()
|
||||||
|
if not s:
|
||||||
|
return False
|
||||||
|
return s.lower() in NULL_STRING_SENTINELS
|
||||||
|
|
||||||
|
|
||||||
|
def _is_char_missing(value: Any) -> bool:
|
||||||
|
"""True if ``value`` should be treated as missing for a CHAR/TEXT
|
||||||
|
column. Unifies the three-way check (None / NaN / empty-or-sentinel
|
||||||
|
string) that used to live inline in several helpers so extending
|
||||||
|
the sentinel set in one place propagates everywhere."""
|
||||||
|
if value is None:
|
||||||
|
return True
|
||||||
|
if isinstance(value, float) and pd.isna(value):
|
||||||
|
return True
|
||||||
|
if isinstance(value, str):
|
||||||
|
s = value.strip()
|
||||||
|
if not s:
|
||||||
|
return True
|
||||||
|
if s.lower() in NULL_STRING_SENTINELS:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _null_sentinel_mask(series: pd.Series) -> pd.Series:
|
||||||
|
"""Return a copy of ``series`` with empty strings and any value in
|
||||||
|
:data:`NULL_STRING_SENTINELS` replaced by ``None``.
|
||||||
|
|
||||||
|
Previously the coercion paths (numeric / datetime / TEXT) only
|
||||||
|
rewrote the empty string. That meant the literal text ``"null"``
|
||||||
|
sailed through ``pd.to_numeric(errors="coerce")`` as ``NaN`` (fine
|
||||||
|
for numerics; by accident) but ``pd.to_datetime(errors="coerce")``
|
||||||
|
handed it to ``dateutil`` which happily parsed it as... today's
|
||||||
|
date (dateutil treats bare words as "use current date for missing
|
||||||
|
fields"). Routing through this helper fixes both problems in one
|
||||||
|
pass. Non-string values are left alone so already-parsed
|
||||||
|
Timestamps / dates / numbers pass through untouched.
|
||||||
|
"""
|
||||||
|
if not pd.api.types.is_object_dtype(series):
|
||||||
|
return series
|
||||||
|
return series.map(lambda v: None if _is_char_missing(v) else v)
|
||||||
|
|
||||||
|
|
||||||
VALID_IF_EXISTS = ("fail", "replace", "append")
|
VALID_IF_EXISTS = ("fail", "replace", "append")
|
||||||
|
|
||||||
VALID_FILE_TYPES = ("sas", "text")
|
VALID_FILE_TYPES = ("sas", "text")
|
||||||
@ -1181,12 +1258,12 @@ def union_column_types(
|
|||||||
|
|
||||||
def _all_null(series: pd.Series) -> bool:
|
def _all_null(series: pd.Series) -> bool:
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
|
return bool(series.map(_is_char_missing).all())
|
||||||
return bool(series.isna().all())
|
return bool(series.isna().all())
|
||||||
|
|
||||||
|
|
||||||
def _char_missing_mask(series: pd.Series) -> pd.Series:
    """Return a per-cell mask of the missing values in ``series``.

    Each cell is evaluated with :func:`_is_char_missing`, so the result
    is True where the cell is ``None``, NaN, an empty string, or a NULL
    sentinel literal, and False elsewhere.
    """
    missing_flags = series.map(_is_char_missing)
    return missing_flags
|
||||||
|
|
||||||
|
|
||||||
def _is_nullable(series: pd.Series) -> bool:
|
def _is_nullable(series: pd.Series) -> bool:
|
||||||
@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
|
||||||
|
# Python None up front so every type branch below can skip its own
|
||||||
|
# empty-string dance.
|
||||||
|
if _is_null_string(value):
|
||||||
|
return None
|
||||||
|
|
||||||
pg_upper = pg_type.upper()
|
pg_upper = pg_type.upper()
|
||||||
|
|
||||||
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
|
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
|
||||||
@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
|
|||||||
differs from SAS/Oracle convention in corner cases; applying our
|
differs from SAS/Oracle convention in corner cases; applying our
|
||||||
own parser keeps behavior predictable.
|
own parser keeps behavior predictable.
|
||||||
"""
|
"""
|
||||||
coerced = series.replace({"": None})
|
coerced = _null_sentinel_mask(series)
|
||||||
numeric = pd.to_numeric(coerced, errors="coerce")
|
numeric = pd.to_numeric(coerced, errors="coerce")
|
||||||
all_numeric = numeric.notna().sum() == coerced.notna().sum()
|
all_numeric = numeric.notna().sum() == coerced.notna().sum()
|
||||||
if all_numeric and coerced.notna().any():
|
if all_numeric and coerced.notna().any():
|
||||||
@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
|||||||
if pg in ("INTEGER", "BIGINT", "SMALLINT"):
|
if pg in ("INTEGER", "BIGINT", "SMALLINT"):
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
series = pd.to_numeric(
|
series = pd.to_numeric(
|
||||||
series.replace({"": None}), errors="coerce"
|
_null_sentinel_mask(series), errors="coerce"
|
||||||
)
|
)
|
||||||
out[name] = series.astype("Int64")
|
out[name] = series.astype("Int64")
|
||||||
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
|
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
series = pd.to_numeric(
|
series = pd.to_numeric(
|
||||||
series.replace({"": None}), errors="coerce"
|
_null_sentinel_mask(series), errors="coerce"
|
||||||
)
|
)
|
||||||
out[name] = series.astype("float64")
|
out[name] = series.astype("float64")
|
||||||
elif pg == "DATE":
|
elif pg == "DATE":
|
||||||
@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
|||||||
# in the COPY statement turns the blanks back into SQL NULL.
|
# in the COPY statement turns the blanks back into SQL NULL.
|
||||||
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
||||||
# so we mask those after the fact rather than branching per cell.
|
# so we mask those after the fact rather than branching per cell.
|
||||||
|
# Object columns also get the sentinel sweep
|
||||||
|
# (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
|
||||||
|
# ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
|
||||||
|
# matching what the numeric / date branches above do.
|
||||||
|
if pd.api.types.is_object_dtype(series):
|
||||||
|
series = _null_sentinel_mask(series)
|
||||||
na_mask = series.isna()
|
na_mask = series.isna()
|
||||||
if pd.api.types.is_numeric_dtype(series):
|
if pd.api.types.is_numeric_dtype(series):
|
||||||
# Hit when a column was auto-unioned to TEXT because at
|
# Hit when a column was auto-unioned to TEXT because at
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user