advanced_dates #11

Merged
dp merged 4 commits from advanced_dates into main 2026-04-23 22:33:18 +00:00
Showing only changes of commit c3d1f72556 - Show all commits

View File

@ -315,6 +315,83 @@ The chunk size can be overridden at runtime via the
changes. Explicit ``chunksize=`` kwargs still win over both.""" changes. Explicit ``chunksize=`` kwargs still win over both."""
NULL_STRING_SENTINELS: frozenset = frozenset(
    (
        # Spelled-out missings.
        "null",
        "none",
        "nan",
        # Spreadsheet / R conventions.
        "na",
        "n/a",
        "#n/a",
        # SAS / Stata missing sentinel when exported as CHAR.
        ".",
    )
)
"""String spellings of "missing" that are loaded as SQL ``NULL``.

Shared by type inference, nullability detection, and COPY preparation.
These show up when a source system exports missing values as literal
text (``"null"`` held verbatim in a SAS CHAR column, for example), as
the SAS/Stata ``.`` sentinel, or as spreadsheet-style ``NA`` / ``N/A``.

The set is intentionally small:

* ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
* ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
* ``"."`` - SAS / Stata missing sentinel as a CHAR export.

Callers compare against the stripped, lowercased form of each value,
so matching is case-insensitive and ignores surrounding whitespace.
Every entry must therefore itself be lowercase with no leading or
trailing whitespace, or it can never match.

To recognise additional sentinels, rebind the module attribute from
the calling module, e.g. ``import load_sas;
load_sas.NULL_STRING_SENTINELS = load_sas.NULL_STRING_SENTINELS |
{"missing"}``. Avoid ambiguous tokens such as ``"0"`` or
``"unknown"``: those are legitimate data in plenty of schemas.
"""
def _is_null_string(value: Any) -> bool:
    """Report whether ``value`` is a string spelling of a missing value.

    The value is stripped of surrounding whitespace and lowercased
    before being looked up in :data:`NULL_STRING_SENTINELS`.

    Args:
        value: Any Python object. Anything that is not a ``str``
            (floats, dates, ``None``, ...) yields ``False``, so the
            helper can be used in row-walks over mixed-type cells.

    Returns:
        ``True`` only for a non-blank string whose normalised form is
        in the sentinel set. Empty and whitespace-only strings return
        ``False``.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        # A blank string never counts as a sentinel here, whatever the
        # set contains; blank handling is left to the caller.
        return bool(token) and token in NULL_STRING_SENTINELS
    return False
def _is_char_missing(value: Any) -> bool:
    """Report whether ``value`` counts as missing for a CHAR/TEXT column.

    Single home for the "is this cell missing?" test so that extending
    :data:`NULL_STRING_SENTINELS` propagates to every caller.

    A value is missing when it is:

    * ``None``;
    * a string that is empty / whitespace-only, or that
      :func:`_is_null_string` recognises as a sentinel;
    * any other scalar that ``pd.isna`` reports as null (float ``NaN``,
      ``pd.NA``, ``pd.NaT``, ...), which keeps this per-cell check in
      agreement with ``Series.isna()``.

    Args:
        value: Any Python object taken from a column cell.

    Returns:
        ``True`` if the cell should be treated as missing, else
        ``False``. Non-scalar objects (lists, arrays, ...) are never
        considered missing.
    """
    if value is None:
        return True
    if isinstance(value, str):
        # Blank text is missing; sentinel matching is delegated so the
        # normalisation rules live in exactly one place.
        return not value.strip() or _is_null_string(value)
    # Guard with is_scalar: pd.isna on a list/array returns an array,
    # whose truth value is ambiguous.
    if pd.api.types.is_scalar(value):
        return bool(pd.isna(value))
    return False
def _null_sentinel_mask(series: pd.Series) -> pd.Series:
    """Replace missing-like cells of an object-dtype series with ``None``.

    Every cell for which :func:`_is_char_missing` is true (``None``,
    ``NaN``, blank strings, and anything in
    :data:`NULL_STRING_SENTINELS`) becomes ``None``; all other cells,
    including already-parsed Timestamps / dates / numbers, are passed
    through unchanged.

    Background, as recorded when this helper was introduced: the
    numeric / datetime / TEXT coercion paths used to rewrite only the
    empty string. A literal ``"null"`` then became ``NaN`` under
    ``pd.to_numeric(errors="coerce")`` (acceptable, but by accident),
    while ``pd.to_datetime(errors="coerce")`` was reported to hand it
    to ``dateutil``, which parsed it as today's date. Routing every
    coercion path through this helper removes both problems.

    Args:
        series: Column to clean.

    Returns:
        A new series produced by ``Series.map`` when ``series`` has
        object dtype. For any other dtype the input series itself is
        returned (not a copy).
    """
    if pd.api.types.is_object_dtype(series):

        def _none_if_missing(cell):
            return None if _is_char_missing(cell) else cell

        return series.map(_none_if_missing)
    # Non-object columns cannot hold sentinel strings; nothing to do.
    return series
VALID_IF_EXISTS = ("fail", "replace", "append") VALID_IF_EXISTS = ("fail", "replace", "append")
VALID_FILE_TYPES = ("sas", "text") VALID_FILE_TYPES = ("sas", "text")
@ -1181,12 +1258,12 @@ def union_column_types(
def _all_null(series: pd.Series) -> bool: def _all_null(series: pd.Series) -> bool:
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all()) return bool(series.map(_is_char_missing).all())
return bool(series.isna().all()) return bool(series.isna().all())
def _char_missing_mask(series: pd.Series) -> pd.Series: def _char_missing_mask(series: pd.Series) -> pd.Series:
return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == "")) return series.map(_is_char_missing)
def _is_nullable(series: pd.Series) -> bool: def _is_nullable(series: pd.Series) -> bool:
@ -2021,6 +2098,12 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
except (TypeError, ValueError): except (TypeError, ValueError):
pass pass
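# Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
# Python None up front so the type branches below never see them.
# Empty / whitespace-only strings are NOT caught here
# (``_is_null_string`` returns False for them), so each branch must
# still handle blanks itself.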
if _is_null_string(value):
return None
pg_upper = pg_type.upper() pg_upper = pg_type.upper()
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"): if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
@ -2582,7 +2665,7 @@ def _safe_object_to_datetime(
differs from SAS/Oracle convention in corner cases; applying our differs from SAS/Oracle convention in corner cases; applying our
own parser keeps behavior predictable. own parser keeps behavior predictable.
""" """
coerced = series.replace({"": None}) coerced = _null_sentinel_mask(series)
numeric = pd.to_numeric(coerced, errors="coerce") numeric = pd.to_numeric(coerced, errors="coerce")
all_numeric = numeric.notna().sum() == coerced.notna().sum() all_numeric = numeric.notna().sum() == coerced.notna().sum()
if all_numeric and coerced.notna().any(): if all_numeric and coerced.notna().any():
@ -2657,13 +2740,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
if pg in ("INTEGER", "BIGINT", "SMALLINT"): if pg in ("INTEGER", "BIGINT", "SMALLINT"):
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
series = pd.to_numeric( series = pd.to_numeric(
series.replace({"": None}), errors="coerce" _null_sentinel_mask(series), errors="coerce"
) )
out[name] = series.astype("Int64") out[name] = series.astype("Int64")
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"): elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
if pd.api.types.is_object_dtype(series): if pd.api.types.is_object_dtype(series):
series = pd.to_numeric( series = pd.to_numeric(
series.replace({"": None}), errors="coerce" _null_sentinel_mask(series), errors="coerce"
) )
out[name] = series.astype("float64") out[name] = series.astype("float64")
elif pg == "DATE": elif pg == "DATE":
@ -2722,6 +2805,12 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
# in the COPY statement turns the blanks back into SQL NULL. # in the COPY statement turns the blanks back into SQL NULL.
# astype(str) stringifies NaN/None to the literal "nan"/"None", # astype(str) stringifies NaN/None to the literal "nan"/"None",
# so we mask those after the fact rather than branching per cell. # so we mask those after the fact rather than branching per cell.
# Object columns also get the sentinel sweep
# (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
# ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
# matching what the numeric / date branches above do.
if pd.api.types.is_object_dtype(series):
series = _null_sentinel_mask(series)
na_mask = series.isna() na_mask = series.isna()
if pd.api.types.is_numeric_dtype(series): if pd.api.types.is_numeric_dtype(series):
# Hit when a column was auto-unioned to TEXT because at # Hit when a column was auto-unioned to TEXT because at