advanced_dates #11
@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
|
||||
changes. Explicit ``chunksize=`` kwargs still win over both."""
|
||||
|
||||
|
||||
NULL_STRING_SENTINELS: frozenset = frozenset(
    (
        # Spelled-out missing markers.
        "null",
        "none",
        "nan",
        # Spreadsheet / R conventions.
        "na",
        "n/a",
        "#n/a",
        # SAS / Stata missing sentinel when exported as CHAR.
        ".",
    )
)
"""Lowercased string literals that are treated as SQL ``NULL`` across
inference, nullability detection, and COPY preparation.

These show up when a source system exports missing values as literal
text: the word ``"null"`` (some SAS CHAR columns really do hold it
verbatim), the SAS/Stata ``.`` missing sentinel, or spreadsheet-style
``NA`` / ``N/A``.

The set is deliberately narrow:

* ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
* ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
* ``"."`` - SAS / Stata missing sentinel as CHAR export.

Entries must be stored lowercased: the helpers that consult this set
strip leading / trailing whitespace and lowercase the candidate before
the membership test, which is what makes matching case-insensitive.

To support additional sentinels, rebind the name from a calling module
(``import load_sas; load_sas.NULL_STRING_SENTINELS = frozenset({...})``).
Do not add ambiguous tokens such as ``"0"`` or ``"unknown"`` - those
are legitimate data in plenty of schemas."""
|
||||
|
||||
|
||||
def _is_null_string(value: Any) -> bool:
|
||||
"""True if ``value`` is a string whose lowercased/stripped form is
|
||||
in :data:`NULL_STRING_SENTINELS`. Safe to call on any Python value;
|
||||
non-strings return False so the helper can be dropped into the same
|
||||
row-walks that also see floats / dates / None."""
|
||||
if not isinstance(value, str):
|
||||
return False
|
||||
s = value.strip()
|
||||
if not s:
|
||||
return False
|
||||
return s.lower() in NULL_STRING_SENTINELS
|
||||
|
||||
|
||||
def _is_char_missing(value: Any) -> bool:
|
||||
"""True if ``value`` should be treated as missing for a CHAR/TEXT
|
||||
column. Unifies the three-way check (None / NaN / empty-or-sentinel
|
||||
string) that used to live inline in several helpers so extending
|
||||
the sentinel set in one place propagates everywhere."""
|
||||
if value is None:
|
||||
return True
|
||||
if isinstance(value, float) and pd.isna(value):
|
||||
return True
|
||||
if isinstance(value, str):
|
||||
s = value.strip()
|
||||
if not s:
|
||||
return True
|
||||
if s.lower() in NULL_STRING_SENTINELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _null_sentinel_mask(series: pd.Series) -> pd.Series:
|
||||
"""Return a copy of ``series`` with empty strings and any value in
|
||||
:data:`NULL_STRING_SENTINELS` replaced by ``None``.
|
||||
|
||||
Previously the coercion paths (numeric / datetime / TEXT) only
|
||||
rewrote the empty string. That meant the literal text ``"null"``
|
||||
sailed through ``pd.to_numeric(errors="coerce")`` as ``NaN`` (fine
|
||||
for numerics; by accident) but ``pd.to_datetime(errors="coerce")``
|
||||
handed it to ``dateutil`` which happily parsed it as... today's
|
||||
date (dateutil treats bare words as "use current date for missing
|
||||
fields"). Routing through this helper fixes both problems in one
|
||||
pass. Non-string values are left alone so already-parsed
|
||||
Timestamps / dates / numbers pass through untouched.
|
||||
"""
|
||||
if not pd.api.types.is_object_dtype(series):
|
||||
return series
|
||||
return series.map(lambda v: None if _is_char_missing(v) else v)
|
||||
|
||||
|
||||
VALID_IF_EXISTS = ("fail", "replace", "append")
|
||||
|
||||
VALID_FILE_TYPES = ("sas", "text")
|
||||
@ -1181,12 +1258,12 @@ def union_column_types(
|
||||
|
||||
def _all_null(series: pd.Series) -> bool:
|
||||
if pd.api.types.is_object_dtype(series):
|
||||
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
|
||||
return bool(series.map(_is_char_missing).all())
|
||||
return bool(series.isna().all())
|
||||
|
||||
|
||||
def _char_missing_mask(series: pd.Series) -> pd.Series:
    """Map each value of ``series`` to whether it is missing for CHAR/TEXT purposes.

    The per-cell decision is delegated to :func:`_is_char_missing`, so
    blanks and sentinel strings are flagged alongside ``None`` / NaN.
    The previous inline predicate recognised only None / NaN / ``""`` and
    made the helper-based return unreachable.

    Returns the result of ``series.map(_is_char_missing)``, aligned to the
    index of ``series``.
    """
    return series.map(_is_char_missing)
|
||||
|
||||
|
||||
def _is_nullable(series: pd.Series) -> bool:
|
||||
@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
# Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
|
||||
# Python None up front so every type branch below can skip its own
|
||||
# empty-string dance.
|
||||
if _is_null_string(value):
|
||||
return None
|
||||
|
||||
pg_upper = pg_type.upper()
|
||||
|
||||
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
|
||||
@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
|
||||
differs from SAS/Oracle convention in corner cases; applying our
|
||||
own parser keeps behavior predictable.
|
||||
"""
|
||||
coerced = series.replace({"": None})
|
||||
coerced = _null_sentinel_mask(series)
|
||||
numeric = pd.to_numeric(coerced, errors="coerce")
|
||||
all_numeric = numeric.notna().sum() == coerced.notna().sum()
|
||||
if all_numeric and coerced.notna().any():
|
||||
@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
||||
if pg in ("INTEGER", "BIGINT", "SMALLINT"):
|
||||
if pd.api.types.is_object_dtype(series):
|
||||
series = pd.to_numeric(
|
||||
series.replace({"": None}), errors="coerce"
|
||||
_null_sentinel_mask(series), errors="coerce"
|
||||
)
|
||||
out[name] = series.astype("Int64")
|
||||
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
|
||||
if pd.api.types.is_object_dtype(series):
|
||||
series = pd.to_numeric(
|
||||
series.replace({"": None}), errors="coerce"
|
||||
_null_sentinel_mask(series), errors="coerce"
|
||||
)
|
||||
out[name] = series.astype("float64")
|
||||
elif pg == "DATE":
|
||||
@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
||||
# in the COPY statement turns the blanks back into SQL NULL.
|
||||
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
||||
# so we mask those after the fact rather than branching per cell.
|
||||
# Object columns also get the sentinel sweep
|
||||
# (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
|
||||
# ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
|
||||
# matching what the numeric / date branches above do.
|
||||
if pd.api.types.is_object_dtype(series):
|
||||
series = _null_sentinel_mask(series)
|
||||
na_mask = series.isna()
|
||||
if pd.api.types.is_numeric_dtype(series):
|
||||
# Hit when a column was auto-unioned to TEXT because at
|
||||
|
||||
Loading…
Reference in New Issue
Block a user