Enhance date and datetime parsing in load_sas.py with flexible regex and fallback formats

Introduced a locale-independent month lookup and improved date parsing functions to handle various date formats, including SAS and Oracle styles. The new _parse_flexible_date and _parse_flexible_datetime functions provide robust parsing capabilities, accommodating both date-only and datetime inputs. Updated _try_date_coerce and _try_datetime_coerce to utilize these new functions, ensuring better handling of diverse date formats during data loading.
2026-04-22 12:28:19 -05:00 · 2026-04-22 12:28:19 -05:00 · c3fa943e77
commit c3fa943e77
parent f63d684d51
1 changed files with 230 additions and 16 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@ -1261,20 +1261,197 @@ def _try_float_coerce(values: List[str]) -> bool:
    return True
 # Month-name -> month-number table for the ``DD-MON-YY`` /
 # ``DDMONYYYY`` family of date strings (Oracle's default ``DD-MON-YY``
 # export, SAS ``DATE7.`` / ``DATE9.`` rendered to text, spreadsheets
 # emitting ``23-Mar-2020``). Keys are upper-case English names, so
 # look-ups must upper-case the token first. The table exists because
 # ``strptime("%b")`` follows the host's ``LC_TIME`` and silently fails
 # on non-English systems; a plain dict is locale-independent.
 _MONTH_LOOKUP: Dict[str, int] = {
    # Abbreviations (``SEPT`` is accepted alongside ``SEP``).
    "JAN": 1,
    "FEB": 2,
    "MAR": 3,
    "APR": 4,
    "MAY": 5,
    "JUN": 6,
    "JUL": 7,
    "AUG": 8,
    "SEP": 9,
    "SEPT": 9,
    "OCT": 10,
    "NOV": 11,
    "DEC": 12,
    # Full names (``MAY`` is already covered by the abbreviation above).
    "JANUARY": 1,
    "FEBRUARY": 2,
    "MARCH": 3,
    "APRIL": 4,
    "JUNE": 6,
    "JULY": 7,
    "AUGUST": 8,
    "SEPTEMBER": 9,
    "OCTOBER": 10,
    "NOVEMBER": 11,
    "DECEMBER": 12,
 }
 # Matches ``DD[sep]MON[sep]YY[YY]`` with an optional
 # ``HH:MM[:SS[.ffff]] [AM|PM]`` suffix. Each ``sep`` is at most one
 # ``-``, ``/`` or whitespace character (or nothing at all), so the same
 # regex covers ``23-MAR-20``, ``23-MAR-2020``, ``23MAR2020`` (SAS
 # ``DATE9.``), ``23 Mar 2020`` (Excel), and ``23-MAR-20 14:30:00``
 # (Oracle ``TO_CHAR`` default with timestamp).
 #
 # What the pattern does and does not check:
 # * ``month`` is any run of 3-9 ASCII letters; the regex does not
 #   verify that it is a real month name, so users of the match must.
 # * ``year`` is exactly 2 or 4 digits; century expansion is left to
 #   the code that consumes the match.
 # * The date and time parts are separated by one or more of
 #   whitespace, ``T`` or ``:`` (so ``23MAR2020:14:30:00`` matches too).
 # * Hour / minute / second may be joined by ``:`` or ``.`` since
 #   Oracle's default timestamp rendering uses dots
 #   (``02.30.45.123456``) while most others use colons. Fractional
 #   seconds are only recognised after a ``.`` and only when seconds
 #   are present.
 # * The trailing ``AM`` / ``PM`` marker is optional and matched in any
 #   letter case.
 # Leading and trailing whitespace is tolerated; anything else around
 # the value makes the match fail (the pattern is anchored both ends).
 _DDMONYY_RE = re.compile(
    r"""
    ^\s*
    (?P<day>\d{1,2})
    [-/\s]?
    (?P<month>[A-Za-z]{3,9})
    [-/\s]?
    (?P<year>\d{2}|\d{4})
    (?:
        [\sT:]+
        (?P<hour>\d{1,2}) [:.] (?P<minute>\d{2})
        (?:
            [:.] (?P<second>\d{2})
            (?: \. (?P<micro>\d+) )?
        )?
        \s*
        (?P<ampm>[AaPp][Mm])?
    )?
    \s*$
    """,
    re.VERBOSE,
 )
 # ``strptime`` fallbacks for all-numeric date shapes, which
 # ``_DDMONYY_RE`` cannot match because it requires an alphabetic
 # month. ``_parse_flexible_date`` tries these in order and returns the
 # first one that parses, so order matters: unambiguous
 # 4-digit-year-first layouts come first, then US-style ``mm/dd``
 # before EU-style ``dd/mm`` (the former is dominant in the kinds of
 # exports this loader sees).
 #
 # The format is chosen per value, not per column, so a genuinely
 # ``dd/mm`` column can end up with mixed interpretations:
 # ``01/02/2020`` parses as 2 January via ``%m/%d/%Y`` while
 # ``13/02/2020`` only fits ``%d/%m/%Y`` and becomes 13 February.
 # Columns whose true format is ``DD/MM/YY`` should pin the Postgres
 # type via ``column_types: {col: TEXT}`` and parse themselves
 # downstream.
 _EXTRA_DATE_FORMATS: Tuple[str, ...] = (
    "%Y/%m/%d",
    "%Y%m%d",
    "%m/%d/%Y",
    "%m/%d/%y",
    "%m-%d-%Y",
    "%m-%d-%y",
    "%d/%m/%Y",
    "%d/%m/%y",
    "%d-%m-%Y",
    "%d-%m-%y",
 )
 # ``strptime`` fallbacks for date-plus-time strings, tried in order by
 # ``_parse_flexible_datetime`` once ISO parsing and ``_DDMONYY_RE``
 # have both failed; the first format that parses wins. The ISO-like
 # ``%Y-%m-%d`` layouts come first, then US-style ``mm/dd`` before
 # EU-style ``dd/mm``, with ``%Y/%m/%d`` last.
 # NOTE(review): unlike the ``mm/dd`` entries there is no seconds-less
 # ``dd/mm ... %H:%M`` variant and no ``mm-dd`` / ``dd-mm`` dash-separated
 # variant here -- confirm that asymmetry is intentional.
 _EXTRA_DATETIME_FORMATS: Tuple[str, ...] = (
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M:%S.%f",
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%dT%H:%M:%S.%f",
    "%m/%d/%Y %H:%M:%S",
    "%m/%d/%Y %H:%M",
    "%m/%d/%y %H:%M:%S",
    "%m/%d/%y %H:%M",
    "%d/%m/%Y %H:%M:%S",
    "%d/%m/%y %H:%M:%S",
    "%Y/%m/%d %H:%M:%S",
 )
 def _parse_flexible_date(value: Any) -> Optional[dt.date]:
    """Return ``value`` as a ``datetime.date``, or ``None`` if it cannot
    be parsed.
    Three strategies are tried in order on the stripped string: ISO via
    ``date.fromisoformat``, the ``DD-MON-YY`` family via
    :data:`_DDMONYY_RE`, then each numeric layout in
    :data:`_EXTRA_DATE_FORMATS`. Anything that is not a non-blank
    ``str`` (including ``None``) yields ``None`` instead of raising, so
    this can stand in for a ``dt.date.fromisoformat`` call wrapped in
    ``try``/``except``.
    """
    # ``isinstance`` already rejects ``None``; a non-string collapses to
    # the same "nothing to parse" path as a blank string.
    text = value.strip() if isinstance(value, str) else ""
    if not text:
        return None
    try:
        return dt.date.fromisoformat(text)
    except (ValueError, TypeError):
        pass
    match = _DDMONYY_RE.match(text)
    # A match that carries a time component is deliberately ignored here
    # so ``23-MAR-20 14:30:00`` is not accepted as a plain date (which
    # would let a timestamp column be classified as ``DATE``).
    if match is not None and match.group("hour") is None:
        month_no = _MONTH_LOOKUP.get(match.group("month").upper())
        if month_no is not None:
            try:
                day_no = int(match.group("day"))
                year_token = match.group("year")
                year_no = int(year_token)
                if len(year_token) == 2:
                    # Two-digit years pivot at 69: ``00..68`` -> 2000s,
                    # ``69..99`` -> 1900s, the same window Python's
                    # ``%y`` directive applies in the strptime fallbacks.
                    # NOTE(review): source systems may use another window
                    # (e.g. SAS ``YEARCUTOFF``) -- confirm this suits them.
                    year_no += 2000 if year_no < 69 else 1900
                return dt.date(year_no, month_no, day_no)
            except ValueError:
                # Recognised month but impossible calendar date
                # (e.g. ``31-FEB-20``): give up rather than guess.
                return None
    for fmt in _EXTRA_DATE_FORMATS:
        try:
            parsed = dt.datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.date()
    return None
 def _parse_flexible_datetime(value: Any) -> Optional[dt.datetime]:
    """Return ``value`` as a ``datetime.datetime``, or ``None`` if it
    cannot be parsed.
    Tries, in order: ISO via ``datetime.fromisoformat``, the
    ``DD-MON-YY [time]`` family via :data:`_DDMONYY_RE`, each layout in
    :data:`_EXTRA_DATETIME_FORMATS`, and finally
    :func:`_parse_flexible_date`. A date-only input is promoted to
    midnight, so a column mixing ``23-MAR-20`` and
    ``23-MAR-20 14:30:00`` can be handled as ``TIMESTAMP`` throughout.
    Anything that is not a non-blank ``str`` yields ``None``.
    """
    text = value.strip() if isinstance(value, str) else ""
    if not text:
        return None
    try:
        return dt.datetime.fromisoformat(text)
    except (ValueError, TypeError):
        pass
    match = _DDMONYY_RE.match(text)
    month_no = None
    if match is not None:
        month_no = _MONTH_LOOKUP.get(match.group("month").upper())
    if month_no is not None:
        parts = match.groupdict()
        try:
            day_no = int(parts["day"])
            year_no = int(parts["year"])
            if len(parts["year"]) == 2:
                # Same two-digit-year window as the date parser:
                # ``00..68`` -> 2000s, ``69..99`` -> 1900s.
                year_no += 2000 if year_no < 69 else 1900
            # Time fields the regex did not capture default to zero.
            hour, minute, second = (
                int(parts[key]) if parts[key] else 0
                for key in ("hour", "minute", "second")
            )
            fraction = parts["micro"]
            # ``datetime`` takes whole microseconds: right-pad short
            # fractions to six digits and drop anything beyond.
            micro = int(fraction[:6].ljust(6, "0")) if fraction else 0
            meridiem = (parts["ampm"] or "").upper()
            if meridiem == "PM" and hour < 12:
                hour += 12
            elif meridiem == "AM" and hour == 12:
                hour = 0
            return dt.datetime(
                year_no, month_no, day_no, hour, minute, second, micro
            )
        except ValueError:
            # Recognised month but out-of-range field: give up.
            return None
    for fmt in _EXTRA_DATETIME_FORMATS:
        try:
            return dt.datetime.strptime(text, fmt)
        except ValueError:
            continue
    # Last resort: a date-only string becomes midnight of that day.
    as_date = _parse_flexible_date(text)
    if as_date is None:
        return None
    return dt.datetime(as_date.year, as_date.month, as_date.day)
 def _try_date_coerce(values: List[str]) -> bool:
    for v in values:
-        try:
+        if _parse_flexible_date(v) is None:
-            dt.date.fromisoformat(v)
-        except (ValueError, TypeError):
            return False
    return True
 def _try_datetime_coerce(values: List[str]) -> bool:
    for v in values:
-        try:
+        if _parse_flexible_datetime(v) is None:
-            dt.datetime.fromisoformat(v)
-        except (ValueError, TypeError):
            return False
    return True
@ -1876,10 +2053,7 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
        if isinstance(value, str):
            if value.strip() == "":
                return None
-            try:
+            return _parse_flexible_date(value)
-                return dt.date.fromisoformat(value.strip())
-            except (ValueError, TypeError):
-                return None
        return None
    if pg_upper in ("TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE",
@ -1893,10 +2067,7 @@ def _normalize_partition_value(value: Any, pg_type: str) -> Any:
        if isinstance(value, str):
            if value.strip() == "":
                return None
-            try:
+            return _parse_flexible_datetime(value)
-                return dt.datetime.fromisoformat(value.strip())
-            except (ValueError, TypeError):
-                return None
        return None
    if pg_upper in ("TIME", "TIME WITHOUT TIME ZONE",
@ -2399,8 +2570,17 @@ def _safe_object_to_datetime(
    """Object-dtype to datetime. Shares the safety net (errstate +
    try/except) with :func:`_safe_numeric_to_datetime`. If the column is
    actually numeric-flavored (e.g. SAS wrote numbers into an object
-    column), route to the numeric path; otherwise parse with ``to_datetime``
+    column), route to the numeric path; otherwise try our explicit
-    on the object itself.
+    ``DD-MON-YY`` / strptime format set before falling back to the
    generic ``pd.to_datetime`` dateutil parser.
    The explicit-format pre-pass exists because:
    * ``pd.to_datetime`` on unformatted object columns emits a
      ``UserWarning`` per chunk and parses row-by-row via ``dateutil``
      -- 10-100× slower than a single vectorized strptime.
    * ``dateutil`` *will* parse ``23-MAR-20`` but its 2-digit-year pivot
      differs from SAS/Oracle convention in corner cases; applying our
      own parser keeps behavior predictable.
    """
    coerced = series.replace({"": None})
    numeric = pd.to_numeric(coerced, errors="coerce")
@ -2409,6 +2589,40 @@ def _safe_object_to_datetime(
        return _safe_numeric_to_datetime(
            numeric, unit="s", column_name=column_name, target_type=target_type,
        )
    non_null_count = int(coerced.notna().sum())
    if non_null_count:
        # First pass: our regex-based ``DD-MON-YY`` parser. Cheap,
        # locale-independent, covers the cases ``pd.to_datetime`` warns
        # about. Always parse via the datetime-aware variant so a DATE
        # target whose chunk happens to carry time components
        # (``23-MAR-20 14:30:00``) still parses without warnings; the
        # caller's ``.dt.date`` cast truncates the time, matching the
        # existing datetime64-input branch.
        parsed_py = coerced.map(
            lambda v: _parse_flexible_datetime(v) if v is not None else None
        )
        parsed_ts = pd.to_datetime(parsed_py, errors="coerce")
        if int(parsed_ts.notna().sum()) == non_null_count:
            return parsed_ts
        # Second pass: vectorized ``pd.to_datetime`` with each explicit
        # format. One ``pd.to_datetime(format=fmt)`` call is O(n) in C;
        # trying a handful of them still beats row-by-row dateutil on
        # large chunks. Accept the first format that covers every
        # non-null cell.
        for fmt in _EXTRA_DATETIME_FORMATS + _EXTRA_DATE_FORMATS:
            try:
                with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
                    candidate = pd.to_datetime(coerced, format=fmt, errors="coerce")
            except (ValueError, TypeError):
                continue
            if int(candidate.notna().sum()) == non_null_count:
                return candidate
    # Final fallback: ``dateutil`` via ``pd.to_datetime``. Handles
    # shapes our explicit list missed (rare edge cases, mixed formats
    # within one column). Same safety net as the numeric path.
    try:
        with np.errstate(over="ignore", invalid="ignore", divide="ignore"):
            return pd.to_datetime(coerced, errors="coerce")