From 126548927653281faf5a0dd0935483317a24c9e9 Mon Sep 17 00:00:00 2001 From: David Peterson Date: Tue, 21 Apr 2026 08:16:17 -0500 Subject: [PATCH] Enhance date and timestamp handling in _prepare_for_copy function in load_sas.py Added support for numeric date and datetime conversions from SAS formats. Implemented logic to handle float64 representations of dates (days since 1960-01-01) and datetimes (seconds since 1960-01-01), ensuring proper parsing and preventing errors during data copying to Postgres. This enhancement improves compatibility with various SAS date formats. --- generic_loader/load_sas.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 8dfe18f..2818bbc 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -1794,6 +1794,17 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da series.replace({"": None}), errors="coerce" ) out[name] = parsed.dt.date + elif pd.api.types.is_numeric_dtype(series): + # pyreadstat couldn't decode the SAS format (some + # ``DATEw.``/``YYMMDDw.`` variants and all custom formats slip + # through) so the column came back as float64: days since + # 1960-01-01, the SAS epoch. Without this branch the raw + # number would hit COPY and Postgres rejects it with + # ``invalid input syntax for type date``. + parsed = pd.to_datetime( + series, unit="D", origin="1960-01-01", errors="coerce", + ) + out[name] = parsed.dt.date else: out[name] = series elif pg in ("TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE", "TIMESTAMP WITH TIME ZONE"): @@ -1803,6 +1814,14 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da out[name] = pd.to_datetime( series.replace({"": None}), errors="coerce" ) + elif pd.api.types.is_numeric_dtype(series): + # Same story as the DATE branch above, but SAS datetimes are + # *seconds* since 1960-01-01 (fractional seconds for + # ``DATETIMEw.d``). 
Example caught in the wild: + # ``1915465463.615`` -> 2020-09-11 17:44:23.615. + out[name] = pd.to_datetime( + series, unit="s", origin="1960-01-01", errors="coerce", + ) else: out[name] = series elif pg in ("TIME", "TIME WITHOUT TIME ZONE", "TIME WITH TIME ZONE"):