# Locale-independent month lookup so ``DD-MON-YY`` / ``DDMONYYYY`` style
# strings (Oracle's default ``DD-MON-YY`` export, SAS ``DATE7.`` /
# ``DATE9.`` rendered to text, spreadsheets emitting ``23-Mar-2020``)
# parse correctly regardless of the host's ``LC_TIME``.  ``strptime("%b")``
# is locale-dependent and fails on non-English systems; this dict
# sidesteps that entirely.  Keys are upper-case; callers must ``.upper()``
# the token before the lookup.
_MONTH_LOOKUP: Dict[str, int] = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AUG": 8, "SEP": 9, "SEPT": 9, "OCT": 10, "NOV": 11, "DEC": 12,
    "JANUARY": 1, "FEBRUARY": 2, "MARCH": 3, "APRIL": 4, "JUNE": 6,
    "JULY": 7, "AUGUST": 8, "SEPTEMBER": 9, "OCTOBER": 10,
    "NOVEMBER": 11, "DECEMBER": 12,
}

# Two-digit years below the pivot land in the 2000s, the rest in the 1900s
# (``00..68`` -> 2000s, ``69..99`` -> 1900s).  This is the same pivot
# ``strptime("%y")`` applies, so the regex path and the numeric fallbacks
# agree with each other.
_TWO_DIGIT_YEAR_PIVOT = 69

# ``DD[sep]MON[sep]YY`` with an optional ``HH:MM[:SS[.ffff]] [AM|PM]``
# suffix.  ``sep`` can be ``-``, ``/``, a single whitespace character, or
# empty, so the same regex covers ``23-MAR-20``, ``23-MAR-2020``,
# ``23MAR2020`` (SAS ``DATE9.``), ``23 Mar 2020`` (Excel), and
# ``23-MAR-20 14:30:00`` (Oracle ``TO_CHAR`` default with timestamp).
# The date/time separator also admits ``:`` and ``T``
# (``23MAR2020:14:30:00``).  The time portion is lenient on its field
# separator (``:`` or ``.``) since Oracle's default timestamp rendering
# uses dots (``02.30.45.123456``) while most others use colons.
#
# The group names are part of the contract: the parsers below read
# ``day``, ``month``, ``year``, ``hour``, ``minute``, ``second``,
# ``micro`` and ``ampm`` by name.
_DDMONYY_RE = re.compile(
    r"""
    ^\s*
    (?P<day>\d{1,2})
    [-/\s]?
    (?P<month>[A-Za-z]{3,9})
    [-/\s]?
    (?P<year>\d{4}|\d{2})
    (?:
        [\sT:]+
        (?P<hour>\d{1,2}) [:.] (?P<minute>\d{2})
        (?:
            [:.] (?P<second>\d{2})
            (?: \. (?P<micro>\d+) )?
        )?
        \s*
        (?P<ampm>[AaPp][Mm])?
    )?
    \s*$
    """,
    re.VERBOSE,
)

# Strptime fallbacks for all-numeric shapes the regex above can't
# disambiguate.  Order matters: unambiguous 4-digit-year layouts first,
# then US-style ``mm/dd`` before EU-style ``dd/mm`` (the former is
# dominant in the kinds of exports this loader sees).  Because each value
# is tried independently, ``13/02/2020`` still parses (as dd/mm) while
# ``01/02/2020`` in the same column parses as mm/dd; columns whose true
# format is ``DD/MM/YY`` should pin the Postgres type via
# ``column_types: {col: TEXT}`` and parse themselves downstream.
#
# ``_safe_object_to_datetime`` also hands these strings to
# ``pd.to_datetime(format=...)``, so every entry must stay a plain
# strptime pattern.
_EXTRA_DATE_FORMATS: Tuple[str, ...] = (
    "%Y/%m/%d",
    "%Y%m%d",
    "%m/%d/%Y",
    "%m/%d/%y",
    "%m-%d-%Y",
    "%m-%d-%y",
    "%d/%m/%Y",
    "%d/%m/%y",
    "%d-%m-%Y",
    "%d-%m-%y",
)

_EXTRA_DATETIME_FORMATS: Tuple[str, ...] = (
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M:%S.%f",
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%dT%H:%M:%S.%f",
    "%m/%d/%Y %H:%M:%S",
    "%m/%d/%Y %H:%M",
    "%m/%d/%y %H:%M:%S",
    "%m/%d/%y %H:%M",
    "%d/%m/%Y %H:%M:%S",
    "%d/%m/%y %H:%M:%S",
    "%Y/%m/%d %H:%M:%S",
)

# The one separator-less entry of ``_EXTRA_DATE_FORMATS``.  ``strptime``
# accepts 1-digit month/day fields, so without a length guard a ``YYYYMM``
# period such as ``202011`` would "parse" as 2020-01-01.
_COMPACT_DATE_FORMAT = "%Y%m%d"


def _ddmonyy_match_to_datetime(m: "re.Match[str]") -> Optional[dt.datetime]:
    """Build a naive ``datetime`` from a :data:`_DDMONYY_RE` match.

    Missing time fields default to zero, so a date-only match yields
    midnight.  Returns ``None`` when the month token is not in
    :data:`_MONTH_LOOKUP` or any field is out of range (``31-FEB-2020``,
    ``25:00``).
    """
    month = _MONTH_LOOKUP.get(m.group("month").upper())
    if month is None:
        return None

    year_text = m.group("year")
    year = int(year_text)
    if len(year_text) == 2:
        year += 2000 if year < _TWO_DIGIT_YEAR_PIVOT else 1900

    hour = int(m.group("hour") or 0)
    minute = int(m.group("minute") or 0)
    second = int(m.group("second") or 0)

    micro = 0
    if m.group("micro"):
        # ``datetime`` takes microseconds as an int in 0..999999; pad or
        # truncate the fractional digits to exactly six places.
        micro = int(m.group("micro")[:6].ljust(6, "0"))

    ampm = m.group("ampm")
    if ampm:
        marker = ampm.upper()
        if marker == "PM" and hour < 12:
            hour += 12
        elif marker == "AM" and hour == 12:
            hour = 0

    try:
        return dt.datetime(year, month, int(m.group("day")), hour, minute, second, micro)
    except ValueError:
        return None


def _parse_flexible_date(value: Any) -> Optional[dt.date]:
    """Parse ``value`` to ``datetime.date`` using ISO first, then the
    ``DD-MON-YY`` family, then the numeric fallbacks in
    :data:`_EXTRA_DATE_FORMATS`.  Returns ``None`` if nothing matches.

    Non-string and empty / whitespace-only inputs return ``None`` rather
    than raising, so callers can use this as a drop-in replacement for
    the ``dt.date.fromisoformat`` + ``try``/``except`` pattern.

    ``DD-MON-YY`` strings that carry a time component are rejected on
    purpose so ``_try_date_coerce`` doesn't swallow ``TIMESTAMP`` columns
    (``23-MAR-20 14:30:00``) and misclassify them as ``DATE``.
    """
    if not isinstance(value, str):
        return None
    s = value.strip()
    if not s:
        return None

    try:
        return dt.date.fromisoformat(s)
    except ValueError:
        pass

    m = _DDMONYY_RE.match(s)
    if m is not None:
        # A string that matches the regex contains an alphabetic month
        # token, so none of the all-numeric fallbacks below could match
        # it; the outcome is decided here.
        if m.group("hour") is not None:
            return None
        parsed = _ddmonyy_match_to_datetime(m)
        return None if parsed is None else parsed.date()

    is_compact = len(s) == 8 and s.isascii() and s.isdigit()
    for fmt in _EXTRA_DATE_FORMATS:
        if fmt == _COMPACT_DATE_FORMAT and not is_compact:
            # Only a full ``YYYYMMDD`` may use the separator-less format.
            continue
        try:
            return dt.datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None


def _parse_flexible_datetime(value: Any) -> Optional[dt.datetime]:
    """Parse ``value`` to ``datetime.datetime``.

    Same format coverage as :func:`_parse_flexible_date` plus the explicit
    datetime shapes in :data:`_EXTRA_DATETIME_FORMATS`.  A date-only input
    is promoted to midnight so callers can treat a column that mixes
    ``23-MAR-20`` and ``23-MAR-20 14:30:00`` as ``TIMESTAMP`` end-to-end.

    Non-string and empty / whitespace-only inputs return ``None``.
    """
    if not isinstance(value, str):
        return None
    s = value.strip()
    if not s:
        return None

    try:
        return dt.datetime.fromisoformat(s)
    except ValueError:
        pass

    m = _DDMONYY_RE.match(s)
    if m is not None:
        # As in the date parser: a regex match rules out every numeric
        # strptime fallback, so the helper's verdict is final.
        return _ddmonyy_match_to_datetime(m)

    for fmt in _EXTRA_DATETIME_FORMATS:
        try:
            return dt.datetime.strptime(s, fmt)
        except ValueError:
            continue

    # Final fallback: accept a date-only string and promote to midnight.
    d = _parse_flexible_date(s)
    if d is not None:
        return dt.datetime(d.year, d.month, d.day)
    return None


def _try_date_coerce(values: List[str]) -> bool:
    """Return ``True`` when every value parses via
    :func:`_parse_flexible_date` (vacuously ``True`` for an empty list)."""
    return all(_parse_flexible_date(v) is not None for v in values)


def _try_datetime_coerce(values: List[str]) -> bool:
    """Return ``True`` when every value parses via
    :func:`_parse_flexible_datetime` (vacuously ``True`` for an empty
    list)."""
    return all(_parse_flexible_datetime(v) is not None for v in values)