Compare commits
No commits in common. "c3d1f72556712d151149d09827b0c2179f94222c" and "f63d684d51177df294d72895103eec3dda3b350e" have entirely different histories.
c3d1f72556
...
f63d684d51
@ -315,83 +315,6 @@ The chunk size can be overridden at runtime via the
|
|||||||
changes. Explicit ``chunksize=`` kwargs still win over both."""
|
changes. Explicit ``chunksize=`` kwargs still win over both."""
|
||||||
|
|
||||||
|
|
||||||
NULL_STRING_SENTINELS: frozenset = frozenset((
    # Spelled-out missings.
    "null", "none", "nan",
    # Spreadsheet / R conventions.
    "na", "n/a", "#n/a",
    # SAS / Stata missing sentinel as it appears in a CHAR export.
    ".",
))
"""Lowercased string literals that are treated as SQL ``NULL`` across
inference, nullability detection, and COPY preparation.

These show up when a source system exports missing values as literal
text: the word ``"null"`` (some SAS CHAR columns hold it verbatim), the
SAS/Stata ``.`` missing sentinel, or spreadsheet-style ``NA`` / ``N/A``.

Members, deliberately few:

* ``"null"``, ``"none"``, ``"nan"`` - the common spelled-out missings.
* ``"na"``, ``"n/a"``, ``"#n/a"`` - spreadsheet / R conventions.
* ``"."`` - SAS / Stata missing sentinel as CHAR export.

Matching is case-insensitive and ignores leading / trailing whitespace.

If your source ships additional sentinels, extend the set from the
calling module (``import load_sas; load_sas.NULL_STRING_SENTINELS =
frozenset({...})``). Do not add ambiguous tokens such as ``"0"`` or
``"unknown"`` - those are legitimate data in plenty of schemas."""
|
|
||||||
|
|
||||||
|
|
||||||
def _is_null_string(value: Any) -> bool:
|
|
||||||
"""True if ``value`` is a string whose lowercased/stripped form is
|
|
||||||
in :data:`NULL_STRING_SENTINELS`. Safe to call on any Python value;
|
|
||||||
non-strings return False so the helper can be dropped into the same
|
|
||||||
row-walks that also see floats / dates / None."""
|
|
||||||
if not isinstance(value, str):
|
|
||||||
return False
|
|
||||||
s = value.strip()
|
|
||||||
if not s:
|
|
||||||
return False
|
|
||||||
return s.lower() in NULL_STRING_SENTINELS
|
|
||||||
|
|
||||||
|
|
||||||
def _is_char_missing(value: Any) -> bool:
|
|
||||||
"""True if ``value`` should be treated as missing for a CHAR/TEXT
|
|
||||||
column. Unifies the three-way check (None / NaN / empty-or-sentinel
|
|
||||||
string) that used to live inline in several helpers so extending
|
|
||||||
the sentinel set in one place propagates everywhere."""
|
|
||||||
if value is None:
|
|
||||||
return True
|
|
||||||
if isinstance(value, float) and pd.isna(value):
|
|
||||||
return True
|
|
||||||
if isinstance(value, str):
|
|
||||||
s = value.strip()
|
|
||||||
if not s:
|
|
||||||
return True
|
|
||||||
if s.lower() in NULL_STRING_SENTINELS:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _null_sentinel_mask(series: pd.Series) -> pd.Series:
|
|
||||||
"""Return a copy of ``series`` with empty strings and any value in
|
|
||||||
:data:`NULL_STRING_SENTINELS` replaced by ``None``.
|
|
||||||
|
|
||||||
Previously the coercion paths (numeric / datetime / TEXT) only
|
|
||||||
rewrote the empty string. That meant the literal text ``"null"``
|
|
||||||
sailed through ``pd.to_numeric(errors="coerce")`` as ``NaN`` (fine
|
|
||||||
for numerics; by accident) but ``pd.to_datetime(errors="coerce")``
|
|
||||||
handed it to ``dateutil`` which happily parsed it as... today's
|
|
||||||
date (dateutil treats bare words as "use current date for missing
|
|
||||||
fields"). Routing through this helper fixes both problems in one
|
|
||||||
pass. Non-string values are left alone so already-parsed
|
|
||||||
Timestamps / dates / numbers pass through untouched.
|
|
||||||
"""
|
|
||||||
if not pd.api.types.is_object_dtype(series):
|
|
||||||
return series
|
|
||||||
return series.map(lambda v: None if _is_char_missing(v) else v)
|
|
||||||
|
|
||||||
|
|
||||||
VALID_IF_EXISTS = ("fail", "replace", "append")
|
VALID_IF_EXISTS = ("fail", "replace", "append")
|
||||||
|
|
||||||
VALID_FILE_TYPES = ("sas", "text")
|
VALID_FILE_TYPES = ("sas", "text")
|
||||||
@ -1258,12 +1181,12 @@ def union_column_types(
|
|||||||
|
|
||||||
def _all_null(series: pd.Series) -> bool:
|
def _all_null(series: pd.Series) -> bool:
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
return bool(series.map(_is_char_missing).all())
|
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
|
||||||
return bool(series.isna().all())
|
return bool(series.isna().all())
|
||||||
|
|
||||||
|
|
||||||
def _char_missing_mask(series: pd.Series) -> pd.Series:
|
def _char_missing_mask(series: pd.Series) -> pd.Series:
|
||||||
return series.map(_is_char_missing)
|
return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == ""))
|
||||||
|
|
||||||
|
|
||||||
def _is_nullable(series: pd.Series) -> bool:
|
def _is_nullable(series: pd.Series) -> bool:
|
||||||
@ -1338,197 +1261,20 @@ def _try_float_coerce(values: List[str]) -> bool:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
# Locale-independent month lookup so ``DD-MON-YY`` / ``DDMONYYYY`` style
|
|
||||||
# strings (Oracle's default ``DD-MON-YY`` export, SAS ``DATE7.`` /
|
|
||||||
# ``DATE9.`` rendered to text, spreadsheets spitting out ``23-Mar-2020``)
|
|
||||||
# parse correctly regardless of the host's ``LC_TIME``. ``strptime("%b")``
|
|
||||||
# is locale-dependent and silently fails on non-English systems; this
|
|
||||||
# dict sidesteps that entirely.
|
|
||||||
_MONTH_LOOKUP: Dict[str, int] = {
|
|
||||||
"JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
|
|
||||||
"JUL": 7, "AUG": 8, "SEP": 9, "SEPT": 9, "OCT": 10, "NOV": 11, "DEC": 12,
|
|
||||||
"JANUARY": 1, "FEBRUARY": 2, "MARCH": 3, "APRIL": 4, "JUNE": 6,
|
|
||||||
"JULY": 7, "AUGUST": 8, "SEPTEMBER": 9, "OCTOBER": 10,
|
|
||||||
"NOVEMBER": 11, "DECEMBER": 12,
|
|
||||||
}
|
|
||||||
|
|
||||||
# ``DD[sep]MON[sep]YY`` with an optional ``HH:MM[:SS[.ffff]] [AM|PM]``
|
|
||||||
# suffix. ``sep`` can be ``-``, ``/``, space, or empty so the same
|
|
||||||
# regex covers ``23-MAR-20``, ``23-MAR-2020``, ``23MAR2020`` (SAS
|
|
||||||
# ``DATE9.``), ``23 Mar 2020`` (Excel), and ``23-MAR-20 14:30:00``
|
|
||||||
# (Oracle ``TO_CHAR`` default with timestamp). Time portion is lenient
|
|
||||||
# on separator (``:`` or ``.``) since Oracle's default timestamp
|
|
||||||
# rendering uses dots (``02.30.45.123456``) while most others use
|
|
||||||
# colons.
|
|
||||||
_DDMONYY_RE = re.compile(
|
|
||||||
r"""
|
|
||||||
^\s*
|
|
||||||
(?P<day>\d{1,2})
|
|
||||||
[-/\s]?
|
|
||||||
(?P<month>[A-Za-z]{3,9})
|
|
||||||
[-/\s]?
|
|
||||||
(?P<year>\d{2}|\d{4})
|
|
||||||
(?:
|
|
||||||
[\sT:]+
|
|
||||||
(?P<hour>\d{1,2}) [:.] (?P<minute>\d{2})
|
|
||||||
(?:
|
|
||||||
[:.] (?P<second>\d{2})
|
|
||||||
(?: \. (?P<micro>\d+) )?
|
|
||||||
)?
|
|
||||||
\s*
|
|
||||||
(?P<ampm>[AaPp][Mm])?
|
|
||||||
)?
|
|
||||||
\s*$
|
|
||||||
""",
|
|
||||||
re.VERBOSE,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Strptime fallbacks for all-numeric shapes the regex above can't
|
|
||||||
# disambiguate. Order matters: unambiguous 4-digit-year layouts first,
|
|
||||||
# then US-style ``mm/dd`` before EU-style ``dd/mm`` (the former is
|
|
||||||
# dominant in the kinds of exports this loader sees). Columns whose
|
|
||||||
# true format is ``DD/MM/YY`` should pin the Postgres type via
|
|
||||||
# ``column_types: {col: TEXT}`` and parse themselves downstream.
|
|
||||||
_EXTRA_DATE_FORMATS: Tuple[str, ...] = (
|
|
||||||
"%Y/%m/%d",
|
|
||||||
"%Y%m%d",
|
|
||||||
"%m/%d/%Y",
|
|
||||||
"%m/%d/%y",
|
|
||||||
"%m-%d-%Y",
|
|
||||||
"%m-%d-%y",
|
|
||||||
"%d/%m/%Y",
|
|
||||||
"%d/%m/%y",
|
|
||||||
"%d-%m-%Y",
|
|
||||||
"%d-%m-%y",
|
|
||||||
)
|
|
||||||
|
|
||||||
_EXTRA_DATETIME_FORMATS: Tuple[str, ...] = (
|
|
||||||
"%Y-%m-%d %H:%M:%S",
|
|
||||||
"%Y-%m-%d %H:%M:%S.%f",
|
|
||||||
"%Y-%m-%dT%H:%M:%S",
|
|
||||||
"%Y-%m-%dT%H:%M:%S.%f",
|
|
||||||
"%m/%d/%Y %H:%M:%S",
|
|
||||||
"%m/%d/%Y %H:%M",
|
|
||||||
"%m/%d/%y %H:%M:%S",
|
|
||||||
"%m/%d/%y %H:%M",
|
|
||||||
"%d/%m/%Y %H:%M:%S",
|
|
||||||
"%d/%m/%y %H:%M:%S",
|
|
||||||
"%Y/%m/%d %H:%M:%S",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_flexible_date(value: Any) -> Optional[dt.date]:
|
|
||||||
"""Parse ``value`` to ``datetime.date`` using ISO first, then the
|
|
||||||
``DD-MON-YY`` family, then the numeric fallbacks in
|
|
||||||
:data:`_EXTRA_DATE_FORMATS`. Returns ``None`` if nothing matches.
|
|
||||||
|
|
||||||
Non-string / empty / non-finite inputs return ``None`` rather than
|
|
||||||
raising so callers can use this as a drop-in replacement for the old
|
|
||||||
``dt.date.fromisoformat`` + ``try``/``except`` pattern.
|
|
||||||
"""
|
|
||||||
if value is None:
|
|
||||||
return None
|
|
||||||
if not isinstance(value, str):
|
|
||||||
return None
|
|
||||||
s = value.strip()
|
|
||||||
if not s:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
return dt.date.fromisoformat(s)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
m = _DDMONYY_RE.match(s)
|
|
||||||
# Reject inputs that carry a time component so ``_try_date_coerce``
|
|
||||||
# doesn't silently swallow ``TIMESTAMP`` columns (``23-MAR-20 14:30:00``)
|
|
||||||
# and misclassify them as ``DATE``.
|
|
||||||
if m and m.group("hour") is None:
|
|
||||||
month = _MONTH_LOOKUP.get(m.group("month").upper())
|
|
||||||
if month is not None:
|
|
||||||
try:
|
|
||||||
day = int(m.group("day"))
|
|
||||||
year = int(m.group("year"))
|
|
||||||
if len(m.group("year")) == 2:
|
|
||||||
# Pivot year = 69 matches SAS / Oracle / Excel
|
|
||||||
# conventions: ``00..68`` -> 2000s, ``69..99`` -> 1900s.
|
|
||||||
year = 2000 + year if year < 69 else 1900 + year
|
|
||||||
return dt.date(year, month, day)
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
for fmt in _EXTRA_DATE_FORMATS:
|
|
||||||
try:
|
|
||||||
return dt.datetime.strptime(s, fmt).date()
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_flexible_datetime(value: Any) -> Optional[dt.datetime]:
|
|
||||||
"""Parse ``value`` to ``datetime.datetime``. Same format coverage as
|
|
||||||
:func:`_parse_flexible_date` plus explicit datetime shapes; a
|
|
||||||
date-only input is promoted to midnight so callers can treat a
|
|
||||||
column that mixes ``23-MAR-20`` and ``23-MAR-20 14:30:00`` as
|
|
||||||
``TIMESTAMP`` end-to-end.
|
|
||||||
"""
|
|
||||||
if value is None:
|
|
||||||
return None
|
|
||||||
if not isinstance(value, str):
|
|
||||||
return None
|
|
||||||
s = value.strip()
|
|
||||||
if not s:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
return dt.datetime.fromisoformat(s)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
m = _DDMONYY_RE.match(s)
|
|
||||||
if m:
|
|
||||||
month = _MONTH_LOOKUP.get(m.group("month").upper())
|
|
||||||
if month is not None:
|
|
||||||
try:
|
|
||||||
day = int(m.group("day"))
|
|
||||||
year = int(m.group("year"))
|
|
||||||
if len(m.group("year")) == 2:
|
|
||||||
year = 2000 + year if year < 69 else 1900 + year
|
|
||||||
hour = int(m.group("hour")) if m.group("hour") else 0
|
|
||||||
minute = int(m.group("minute")) if m.group("minute") else 0
|
|
||||||
second = int(m.group("second")) if m.group("second") else 0
|
|
||||||
micro = 0
|
|
||||||
if m.group("micro"):
|
|
||||||
# ``%f`` expects 1-6 digits; pad / truncate to match.
|
|
||||||
micro_s = m.group("micro")[:6].ljust(6, "0")
|
|
||||||
micro = int(micro_s)
|
|
||||||
ampm = m.group("ampm")
|
|
||||||
if ampm:
|
|
||||||
ap = ampm.upper()
|
|
||||||
if ap == "PM" and hour < 12:
|
|
||||||
hour += 12
|
|
||||||
elif ap == "AM" and hour == 12:
|
|
||||||
hour = 0
|
|
||||||
return dt.datetime(year, month, day, hour, minute, second, micro)
|
|
||||||
except ValueError:
|
|
||||||
return None
|
|
||||||
for fmt in _EXTRA_DATETIME_FORMATS:
|
|
||||||
try:
|
|
||||||
return dt.datetime.strptime(s, fmt)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
# Final fallback: accept a date-only string and promote to midnight.
|
|
||||||
d = _parse_flexible_date(s)
|
|
||||||
if d is not None:
|
|
||||||
return dt.datetime(d.year, d.month, d.day)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _try_date_coerce(values: List[str]) -> bool:
|
def _try_date_coerce(values: List[str]) -> bool:
|
||||||
for v in values:
|
for v in values:
|
||||||
if _parse_flexible_date(v) is None:
|
try:
|
||||||
|
dt.date.fromisoformat(v)
|
||||||
|
except (ValueError, TypeError):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _try_datetime_coerce(values: List[str]) -> bool:
|
def _try_datetime_coerce(values: List[str]) -> bool:
|
||||||
for v in values:
|
for v in values:
|
||||||
if _parse_flexible_datetime(v) is None:
|
try:
|
||||||
|
dt.datetime.fromisoformat(v)
|
||||||
|
except (ValueError, TypeError):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -2098,12 +1844,6 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Sentinel strings (``"null"``, ``"NA"``, ``"."``, ...) collapse to
|
|
||||||
# Python None up front so every type branch below can skip its own
|
|
||||||
# empty-string dance.
|
|
||||||
if _is_null_string(value):
|
|
||||||
return None
|
|
||||||
|
|
||||||
pg_upper = pg_type.upper()
|
pg_upper = pg_type.upper()
|
||||||
|
|
||||||
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
|
if pg_upper in ("INTEGER", "BIGINT", "SMALLINT", "INT", "INT4", "INT8", "INT2"):
|
||||||
@ -2136,7 +1876,10 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
|
|||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
if value.strip() == "":
|
if value.strip() == "":
|
||||||
return None
|
return None
|
||||||
return _parse_flexible_date(value)
|
try:
|
||||||
|
return dt.date.fromisoformat(value.strip())
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if pg_upper in ("TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE",
|
if pg_upper in ("TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE",
|
||||||
@ -2150,7 +1893,10 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
|
|||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
if value.strip() == "":
|
if value.strip() == "":
|
||||||
return None
|
return None
|
||||||
return _parse_flexible_datetime(value)
|
try:
|
||||||
|
return dt.datetime.fromisoformat(value.strip())
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if pg_upper in ("TIME", "TIME WITHOUT TIME ZONE",
|
if pg_upper in ("TIME", "TIME WITHOUT TIME ZONE",
|
||||||
@ -2653,59 +2399,16 @@ def _safe_object_to_datetime(
|
|||||||
"""Object-dtype to datetime. Shares the safety net (errstate +
|
"""Object-dtype to datetime. Shares the safety net (errstate +
|
||||||
try/except) with :func:`_safe_numeric_to_datetime`. If the column is
|
try/except) with :func:`_safe_numeric_to_datetime`. If the column is
|
||||||
actually numeric-flavored (e.g. SAS wrote numbers into an object
|
actually numeric-flavored (e.g. SAS wrote numbers into an object
|
||||||
column), route to the numeric path; otherwise try our explicit
|
column), route to the numeric path; otherwise parse with ``to_datetime``
|
||||||
``DD-MON-YY`` / strptime format set before falling back to the
|
on the object itself.
|
||||||
generic ``pd.to_datetime`` dateutil parser.
|
|
||||||
|
|
||||||
The explicit-format pre-pass exists because:
|
|
||||||
* ``pd.to_datetime`` on unformatted object columns emits a
|
|
||||||
``UserWarning`` per chunk and parses row-by-row via ``dateutil``
|
|
||||||
-- 10-100× slower than a single vectorized strptime.
|
|
||||||
* ``dateutil`` *will* parse ``23-MAR-20`` but its 2-digit-year pivot
|
|
||||||
differs from SAS/Oracle convention in corner cases; applying our
|
|
||||||
own parser keeps behavior predictable.
|
|
||||||
"""
|
"""
|
||||||
coerced = _null_sentinel_mask(series)
|
coerced = series.replace({"": None})
|
||||||
numeric = pd.to_numeric(coerced, errors="coerce")
|
numeric = pd.to_numeric(coerced, errors="coerce")
|
||||||
all_numeric = numeric.notna().sum() == coerced.notna().sum()
|
all_numeric = numeric.notna().sum() == coerced.notna().sum()
|
||||||
if all_numeric and coerced.notna().any():
|
if all_numeric and coerced.notna().any():
|
||||||
return _safe_numeric_to_datetime(
|
return _safe_numeric_to_datetime(
|
||||||
numeric, unit="s", column_name=column_name, target_type=target_type,
|
numeric, unit="s", column_name=column_name, target_type=target_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
non_null_count = int(coerced.notna().sum())
|
|
||||||
if non_null_count:
|
|
||||||
# First pass: our regex-based ``DD-MON-YY`` parser. Cheap,
|
|
||||||
# locale-independent, covers the cases ``pd.to_datetime`` warns
|
|
||||||
# about. Always parse via the datetime-aware variant so a DATE
|
|
||||||
# target whose chunk happens to carry time components
|
|
||||||
# (``23-MAR-20 14:30:00``) still parses without warnings; the
|
|
||||||
# caller's ``.dt.date`` cast truncates the time, matching the
|
|
||||||
# existing datetime64-input branch.
|
|
||||||
parsed_py = coerced.map(
|
|
||||||
lambda v: _parse_flexible_datetime(v) if v is not None else None
|
|
||||||
)
|
|
||||||
parsed_ts = pd.to_datetime(parsed_py, errors="coerce")
|
|
||||||
if int(parsed_ts.notna().sum()) == non_null_count:
|
|
||||||
return parsed_ts
|
|
||||||
|
|
||||||
# Second pass: vectorized ``pd.to_datetime`` with each explicit
|
|
||||||
# format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C;
|
|
||||||
# trying a handful of them still beats row-by-row dateutil on
|
|
||||||
# large chunks. Accept the first format that covers every
|
|
||||||
# non-null cell.
|
|
||||||
for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
|
|
||||||
try:
|
|
||||||
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
|
||||||
candidate = pd.to_datetime(coerced, format=fmt, errors="coerce")
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
continue
|
|
||||||
if int(candidate.notna().sum()) == non_null_count:
|
|
||||||
return candidate
|
|
||||||
|
|
||||||
# Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles
|
|
||||||
# shapes our explicit list missed (rare edge cases, mixed formats
|
|
||||||
# within one column). Same safety net as the numeric path.
|
|
||||||
try:
|
try:
|
||||||
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
|
||||||
return pd.to_datetime(coerced, errors="coerce")
|
return pd.to_datetime(coerced, errors="coerce")
|
||||||
@ -2740,13 +2443,13 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
|||||||
if pg in ("INTEGER", "BIGINT", "SMALLINT"):
|
if pg in ("INTEGER", "BIGINT", "SMALLINT"):
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
series = pd.to_numeric(
|
series = pd.to_numeric(
|
||||||
_null_sentinel_mask(series), errors="coerce"
|
series.replace({"": None}), errors="coerce"
|
||||||
)
|
)
|
||||||
out[name] = series.astype("Int64")
|
out[name] = series.astype("Int64")
|
||||||
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
|
elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
series = pd.to_numeric(
|
series = pd.to_numeric(
|
||||||
_null_sentinel_mask(series), errors="coerce"
|
series.replace({"": None}), errors="coerce"
|
||||||
)
|
)
|
||||||
out[name] = series.astype("float64")
|
out[name] = series.astype("float64")
|
||||||
elif pg == "DATE":
|
elif pg == "DATE":
|
||||||
@ -2805,12 +2508,6 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
|||||||
# in the COPY statement turns the blanks back into SQL NULL.
|
# in the COPY statement turns the blanks back into SQL NULL.
|
||||||
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
||||||
# so we mask those after the fact rather than branching per cell.
|
# so we mask those after the fact rather than branching per cell.
|
||||||
# Object columns also get the sentinel sweep
|
|
||||||
# (:data:`NULL_STRING_SENTINELS`) so a literal ``"null"`` /
|
|
||||||
# ``"NA"`` / ``"."`` value lands as SQL NULL on the way in,
|
|
||||||
# matching what the numeric / date branches above do.
|
|
||||||
if pd.api.types.is_object_dtype(series):
|
|
||||||
series = _null_sentinel_mask(series)
|
|
||||||
na_mask = series.isna()
|
na_mask = series.isna()
|
||||||
if pd.api.types.is_numeric_dtype(series):
|
if pd.api.types.is_numeric_dtype(series):
|
||||||
# Hit when a column was auto-unioned to TEXT because at
|
# Hit when a column was auto-unioned to TEXT because at
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user