Implement type inference sampling in load_sas.py to improve performance on large SAS files. Introduce TYPE_INFERENCE_SAMPLE_ROWS to limit the number of rows scanned for type detection while ensuring nullability checks cover the entire column. Update documentation to reflect these changes.
This commit is contained in:
parent
4f7ded09c6
commit
3a0537270c
@ -163,6 +163,14 @@ Priority order used by :func:`infer_schema`:
|
|||||||
value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise
|
value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise
|
||||||
``DOUBLE PRECISION``.
|
``DOUBLE PRECISION``.
|
||||||
|
|
||||||
|
Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for
|
||||||
|
performance on large files. Nullability and all-null detection still run over
|
||||||
|
the full column (they're vectorized and fast) so a ``NOT NULL`` constraint is
|
||||||
|
never declared for a column that has a null anywhere in the file. Tradeoff:
|
||||||
|
if the first N rows fit ``INTEGER`` but a later row exceeds int32, COPY will
|
||||||
|
fail; bump the sample size or set ``TYPE_INFERENCE_SAMPLE_ROWS = None`` to
|
||||||
|
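scan the whole column. The same risk applies to the other sampled probes
(date detection, string coercion, whole-number check): a value that first
appears after the sample is not seen by type inference.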
|
||||||
|
|
||||||
7. Tunables
|
7. Tunables
|
||||||
-----------
|
-----------
|
||||||
Module-level knobs at the top of this file:
|
Module-level knobs at the top of this file:
|
||||||
@ -173,6 +181,8 @@ Module-level knobs at the top of this file:
|
|||||||
char-column coercion is attempted.
|
char-column coercion is attempted.
|
||||||
* ``NUMERIC_INT_RANGE`` - INTEGER bounds; values outside become
|
* ``NUMERIC_INT_RANGE`` - INTEGER bounds; values outside become
|
||||||
``BIGINT``.
|
``BIGINT``.
|
||||||
|
* ``TYPE_INFERENCE_SAMPLE_ROWS`` - cap on rows used for type inference
|
||||||
|
(``None`` = scan the whole column).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@ -210,6 +220,15 @@ values; too small a sample is easy to mis-infer."""
|
|||||||
NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647)
|
NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647)
|
||||||
"""INTEGER bounds; anything outside becomes BIGINT."""
|
"""INTEGER bounds; anything outside becomes BIGINT."""
|
||||||
|
|
||||||
|
TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000
|
||||||
|
"""Cap on rows inspected during per-column type inference. The row-walking
|
||||||
|
helpers (date detection on object columns, string-coercion probes, whole-number
|
||||||
|
check on numeric columns) operate on ``df.head(TYPE_INFERENCE_SAMPLE_ROWS)``
|
||||||
|
instead of the full frame, which matters on SAS files with hundreds of millions
|
||||||
|
of rows. Nullability is still evaluated across the whole column (cheap,
|
||||||
|
vectorized) so ``NOT NULL`` declarations remain safe. Set to ``None`` to scan
|
||||||
|
every row."""
|
||||||
|
|
||||||
|
|
||||||
VALID_IF_EXISTS = ("fail", "replace", "append")
|
VALID_IF_EXISTS = ("fail", "replace", "append")
|
||||||
|
|
||||||
@ -547,6 +566,16 @@ def infer_schema(
|
|||||||
"""
|
"""
|
||||||
original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
|
original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
|
||||||
|
|
||||||
|
# Row-walking type probes run on a bounded head slice; nullability and the
|
||||||
|
# all-null check still see every row so NOT NULL declarations stay honest.
|
||||||
|
total_rows = len(df)
|
||||||
|
if TYPE_INFERENCE_SAMPLE_ROWS is not None and total_rows > TYPE_INFERENCE_SAMPLE_ROWS:
|
||||||
|
sample_df = df.head(TYPE_INFERENCE_SAMPLE_ROWS)
|
||||||
|
sampled = True
|
||||||
|
else:
|
||||||
|
sample_df = df
|
||||||
|
sampled = False
|
||||||
|
|
||||||
# Temporarily flip the module-level flag if the caller asked us to.
|
# Temporarily flip the module-level flag if the caller asked us to.
|
||||||
global COERCE_CHAR_COLUMNS
|
global COERCE_CHAR_COLUMNS
|
||||||
saved = COERCE_CHAR_COLUMNS
|
saved = COERCE_CHAR_COLUMNS
|
||||||
@ -555,6 +584,7 @@ def infer_schema(
|
|||||||
out: Dict[str, ColumnSpec] = {}
|
out: Dict[str, ColumnSpec] = {}
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
series = df[col]
|
series = df[col]
|
||||||
|
sample_series = sample_df[col]
|
||||||
sas_format = original_formats.get(col)
|
sas_format = original_formats.get(col)
|
||||||
notes: List[str] = []
|
notes: List[str] = []
|
||||||
|
|
||||||
@ -567,13 +597,13 @@ def infer_schema(
|
|||||||
elif pd.api.types.is_datetime64_any_dtype(series):
|
elif pd.api.types.is_datetime64_any_dtype(series):
|
||||||
pg_type = "TIMESTAMP"
|
pg_type = "TIMESTAMP"
|
||||||
elif pd.api.types.is_object_dtype(series):
|
elif pd.api.types.is_object_dtype(series):
|
||||||
is_dates, any_dt = _object_is_dates(series)
|
is_dates, any_dt = _object_is_dates(sample_series)
|
||||||
if is_dates:
|
if is_dates:
|
||||||
pg_type = "TIMESTAMP" if any_dt else "DATE"
|
pg_type = "TIMESTAMP" if any_dt else "DATE"
|
||||||
else:
|
else:
|
||||||
pg_type = _infer_char_type(series)
|
pg_type = _infer_char_type(sample_series)
|
||||||
elif pd.api.types.is_numeric_dtype(series):
|
elif pd.api.types.is_numeric_dtype(series):
|
||||||
int_target = _numeric_int_target(series)
|
int_target = _numeric_int_target(sample_series)
|
||||||
if int_target is not None:
|
if int_target is not None:
|
||||||
pg_type = int_target
|
pg_type = int_target
|
||||||
else:
|
else:
|
||||||
@ -582,6 +612,12 @@ def infer_schema(
|
|||||||
pg_type = "TEXT"
|
pg_type = "TEXT"
|
||||||
notes.append(f"unhandled dtype {series.dtype}; defaulting to TEXT")
|
notes.append(f"unhandled dtype {series.dtype}; defaulting to TEXT")
|
||||||
|
|
||||||
|
if sampled:
|
||||||
|
notes.append(
|
||||||
|
f"type inferred from first {TYPE_INFERENCE_SAMPLE_ROWS:,} of "
|
||||||
|
f"{total_rows:,} rows"
|
||||||
|
)
|
||||||
|
|
||||||
nullable = _is_nullable(series)
|
nullable = _is_nullable(series)
|
||||||
|
|
||||||
out[col] = ColumnSpec(
|
out[col] = ColumnSpec(
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user