advanced_analyzer #8
@ -791,10 +791,14 @@ def union_column_types(
|
|||||||
formats.
|
formats.
|
||||||
* **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
|
* **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
|
||||||
with ``d > 0`` in any file implies fractional values somewhere.
|
with ``d > 0`` in any file implies fractional values somewhere.
|
||||||
* **All NUM, otherwise → BIGINT.** Default to BIGINT per user
|
* **All NUM, no useful hint → DOUBLE PRECISION.** SAS numeric
|
||||||
preference: integer-presenting NUM columns drift between
|
formats are *display* formats, not storage constraints - a
|
||||||
INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are
|
``BEST12.`` / ``F8.`` / blank-format column can still hold floats,
|
||||||
worth not re-failing every load.
|
and pyreadstat hands back plain ``float64`` regardless. Defaulting
|
||||||
|
to ``DOUBLE PRECISION`` here costs the same 8 bytes as ``BIGINT``
|
||||||
|
but can't fail on real data. For columns that truly are
|
||||||
|
integer-only and you want ``BIGINT`` semantics in queries, pin
|
||||||
|
them via a ``column_types`` override.
|
||||||
|
|
||||||
Columns missing from a given file are simply skipped for that file;
|
Columns missing from a given file are simply skipped for that file;
|
||||||
the union is computed over whichever files *did* supply the column.
|
the union is computed over whichever files *did* supply the column.
|
||||||
@ -819,13 +823,18 @@ def union_column_types(
|
|||||||
result[col] = "TIMESTAMP"
|
result[col] = "TIMESTAMP"
|
||||||
elif "DATE" in driven:
|
elif "DATE" in driven:
|
||||||
result[col] = "DATE"
|
result[col] = "DATE"
|
||||||
elif any(_format_hints_decimal(f) for f in formats):
|
|
||||||
result[col] = "DOUBLE PRECISION"
|
|
||||||
else:
|
else:
|
||||||
# Safe default: BIGINT. The user explicitly accepted wasting a
|
# Safe default: DOUBLE PRECISION. The BIGINT default we tried
|
||||||
# few bytes here to avoid INTEGER→BIGINT widening failures on
|
# first failed the moment a file contained a fractional
|
||||||
# multi-year clusters.
|
# value in a column whose format didn't carry a decimal
|
||||||
result[col] = "BIGINT"
|
# hint (very common: SAS ``BEST12.`` / ``F8.`` are display
|
||||||
|
# formats, not storage constraints, so the underlying
|
||||||
|
# 8-byte float can hold any value). Same storage cost as
|
||||||
|
# BIGINT, handles both integer- and float-valued data, and
|
||||||
|
# keeps loads from failing mid-cluster. Use a
|
||||||
|
# ``column_types`` override to pin specific columns to
|
||||||
|
# ``BIGINT`` when you want integer semantics in queries.
|
||||||
|
result[col] = "DOUBLE PRECISION"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user