diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index fe52537..0100fc0 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -791,10 +791,14 @@ def union_column_types( formats. * **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format with ``d > 0`` in any file implies fractional values somewhere. - * **All NUM, otherwise → BIGINT.** Default to BIGINT per user - preference: integer-presenting NUM columns drift between - INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are - worth not re-failing every load. + * **All NUM, no useful hint → DOUBLE PRECISION.** SAS numeric + formats are *display* formats, not storage constraints - a + ``BEST12.`` / ``F8.`` / blank-format column can still hold floats, + and pyreadstat hands back plain ``float64`` regardless. Defaulting + to ``DOUBLE PRECISION`` here costs the same 8 bytes as ``BIGINT`` + but can't fail on real data. For columns that truly are + integer-only and you want ``BIGINT`` semantics in queries, pin + them via a ``column_types`` override. Columns missing from a given file are simply skipped for that file; the union is computed over whichever files *did* supply the column. @@ -819,13 +823,18 @@ def union_column_types( result[col] = "TIMESTAMP" elif "DATE" in driven: result[col] = "DATE" - elif any(_format_hints_decimal(f) for f in formats): - result[col] = "DOUBLE PRECISION" else: - # Safe default: BIGINT. The user explicitly accepted wasting a - # few bytes here to avoid INTEGER→BIGINT widening failures on - # multi-year clusters. - result[col] = "BIGINT" + # Safe default: DOUBLE PRECISION. The BIGINT default we tried + # first failed the moment a file contained a fractional + # value in a column whose format didn't carry a decimal + # hint (very common: SAS ``BEST12.`` / ``F8.`` are display + # formats, not storage constraints, so the underlying + # 8-byte float can hold any value). Same storage cost as + # BIGINT, handles both integer- and float-valued data, and + # keeps loads from failing mid-cluster. Use a + # ``column_types`` override to pin specific columns to + # ``BIGINT`` when you want integer semantics in queries. + result[col] = "DOUBLE PRECISION" return result