advanced_analyzer #8
@ -791,10 +791,14 @@ def union_column_types(
|
||||
formats.
|
||||
* **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
|
||||
with ``d > 0`` in any file implies fractional values somewhere.
|
||||
* **All NUM, otherwise → BIGINT.** Default to BIGINT per user
|
||||
preference: integer-presenting NUM columns drift between
|
||||
INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are
|
||||
worth it to avoid repeated load failures.
|
||||
* **All NUM, no useful hint → DOUBLE PRECISION.** SAS numeric
|
||||
formats are *display* formats, not storage constraints — a
|
||||
``BEST12.`` / ``F8.`` / blank-format column can still hold floats,
|
||||
and pyreadstat hands back plain ``float64`` regardless. Defaulting
|
||||
to ``DOUBLE PRECISION`` here costs the same 8 bytes as ``BIGINT``
|
||||
but can't fail on real data. For columns that truly are
|
||||
integer-only and you want ``BIGINT`` semantics in queries, pin
|
||||
them via a ``column_types`` override.
|
||||
|
||||
Columns missing from a given file are simply skipped for that file;
|
||||
the union is computed over whichever files *did* supply the column.
|
||||
@ -819,13 +823,18 @@ def union_column_types(
|
||||
result[col] = "TIMESTAMP"
|
||||
elif "DATE" in driven:
|
||||
result[col] = "DATE"
|
||||
elif any(_format_hints_decimal(f) for f in formats):
|
||||
result[col] = "DOUBLE PRECISION"
|
||||
else:
|
||||
# Safe default: BIGINT. The user explicitly accepted wasting a
|
||||
# few bytes here to avoid INTEGER→BIGINT widening failures on
|
||||
# multi-year clusters.
|
||||
result[col] = "BIGINT"
|
||||
# Safe default: DOUBLE PRECISION. The BIGINT default we tried
|
||||
# first failed the moment a file contained a fractional
|
||||
# value in a column whose format didn't carry a decimal
|
||||
# hint (very common: SAS ``BEST12.`` / ``F8.`` are display
|
||||
# formats, not storage constraints, so the underlying
|
||||
# 8-byte float can hold any value). Same storage cost as
|
||||
# BIGINT, handles both integer- and float-valued data, and
|
||||
# keeps loads from failing mid-cluster. Use a
|
||||
# ``column_types`` override to pin specific columns to
|
||||
# ``BIGINT`` when you want integer semantics in queries.
|
||||
result[col] = "DOUBLE PRECISION"
|
||||
return result
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user