2026-04-21 22:32:18 +00:00
1 changed file with 19 additions and 10 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -791,10 +791,14 @@ def union_column_types(
      formats.
    * **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
      with ``d > 0`` in any file implies fractional values somewhere.
-    * **All NUM, otherwise → BIGINT.** Default to BIGINT per user
-      preference: integer-presenting NUM columns drift between
-      INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are
-      worth not re-failing every load.
+    * **All NUM, no useful hint → DOUBLE PRECISION.** SAS numeric
+      formats are *display* formats, not storage constraints - a
+      ``BEST12.`` / ``F8.`` / blank-format column can still hold floats,
+      and pyreadstat hands back plain ``float64`` regardless. Defaulting
+      to ``DOUBLE PRECISION`` here costs the same 8 bytes as ``BIGINT``
+      but can't fail on real data. For columns that truly are
+      integer-only and you want ``BIGINT`` semantics in queries, pin
+      them via a ``column_types`` override.

    Columns missing from a given file are simply skipped for that file;
    the union is computed over whichever files *did* supply the column.
@@ -819,13 +823,18 @@ def union_column_types(
            result[col] = "TIMESTAMP"
        elif "DATE" in driven:
            result[col] = "DATE"
-        elif any(_format_hints_decimal(f) for f in formats):
-            result[col] = "DOUBLE PRECISION"
        else:
-            # Safe default: BIGINT. The user explicitly accepted wasting a
-            # few bytes here to avoid INTEGER→BIGINT widening failures on
-            # multi-year clusters.
-            result[col] = "BIGINT"
+            # Safe default: DOUBLE PRECISION. The BIGINT default we tried
+            # first failed the moment a file contained a fractional
+            # value in a column whose format didn't carry a decimal
+            # hint (very common: SAS ``BEST12.`` / ``F8.`` are display
+            # formats, not storage constraints, so the underlying
+            # 8-byte float can hold any value). Same storage cost as
+            # BIGINT, handles both integer- and float-valued data, and
+            # keeps loads from failing mid-cluster. Use a
+            # ``column_types`` override to pin specific columns to
+            # ``BIGINT`` when you want integer semantics in queries.
+            result[col] = "DOUBLE PRECISION"
    return result