From f84e127796bc8946b16a5fa98838727d38934455 Mon Sep 17 00:00:00 2001 From: David Peterson Date: Mon, 20 Apr 2026 20:43:27 -0500 Subject: [PATCH] Update type inference behavior in load_sas.py to scan entire files by default Changed the default setting for TYPE_INFERENCE_SAMPLE_ROWS to None, allowing type and nullability inference to consider all rows in a SAS file. This adjustment ensures accurate handling of null values and integer ranges, addressing issues observed in production with large datasets. Updated documentation to reflect the implications of this change and the risks associated with using an integer cap for sampling. --- generic_loader/load_sas.py | 55 +++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 74f5ff8..90a4615 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -183,15 +183,17 @@ Priority order used by :func:`infer_schema`: value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise ``DOUBLE PRECISION``. -Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for -performance on large files. The CLI enforces this at read time via -:func:`read_sas_preview`, so the whole file is never materialized just to pick -types. Sampled specs carry an ``inferred_from_sample`` marker and the usual -tradeoffs: if the first N rows fit ``INTEGER`` but a later row exceeds int32, -or a column had no nulls in the preview but does later in the file, ``COPY`` -will fail mid-stream and the whole transaction rolls back. Set -``TYPE_INFERENCE_SAMPLE_ROWS = None`` to scan every row when exact typing -matters more than speed. +Type inference scans the whole file by default (``TYPE_INFERENCE_SAMPLE_ROWS += None``) so type + nullability are both computed against every row. The CLI +materializes the file once for schema inference, then re-streams it chunk by +chunk into ``COPY``; peak memory is roughly one full dataframe. Override +``TYPE_INFERENCE_SAMPLE_ROWS`` to an integer cap if you're on a host that +can't hold the file in memory - but know that sampled specs carry the usual +risks: a later row may exceed the inferred integer range, or a column that +had no nulls in the preview may carry nulls later in the file (which then +detonates ``COPY`` because the sampled spec stamped it ``NOT NULL``). Seen +in production on a 2.5M-row file with ~6k null MAFIDs past the 10k-row +preview - the entire load aborted mid-stream. Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which commit each chunk as it is copied so an interrupted load retains the rows @@ -255,12 +257,19 @@ values; too small a sample is easy to mis-infer.""" NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647) """INTEGER bounds; anything outside becomes BIGINT.""" -TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000 +TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = None """Cap on rows inspected during per-column type inference. Also governs how many rows :func:`read_sas_preview` pulls from the file for dry-run / validate / -schema-inference flows. Set to ``None`` to scan every row (and read the whole -file into memory for the preview step - don't do this on multi-hundred-million -row files).""" +schema-inference flows. + +Default is ``None`` (scan every row, reading the whole file into memory for +the schema-inference step). That's the only honest setting for nullability: +any integer cap lets a column look ``NOT NULL`` across the first N rows +while the file actually holds rare nulls past the window, which then +detonates ``COPY`` mid-stream (seen in production on a 2.5M-row file where +~6k MAFIDs were null past the 10k-row preview). If you're loading a file +so large that a full read won't fit in memory, set this to an integer cap +and accept that sampled specs can't be trusted for ``NOT NULL``.""" DEFAULT_CHUNK_ROWS = 100_000 """Rows per chunk when streaming a SAS file into ``COPY``. Larger values mean @@ -777,8 +786,12 @@ def infer_schema( """ original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {}) - # Row-walking type probes run on a bounded head slice; nullability and the - # all-null check still see every row so NOT NULL declarations stay honest. + # When ``TYPE_INFERENCE_SAMPLE_ROWS`` is an integer cap, row-walking type + # probes run on the head slice for speed; nullability and the all-null + # check still walk every row of ``df``. That's only honest when the + # caller handed us the full file - with the default cap of ``None`` the + # CLI does exactly that. Callers who pass a partial preview and a tight + # integer cap accept that ``NOT NULL`` can be wrong for rare-null columns. df_rows = len(df) effective_total = total_rows if total_rows is not None else df_rows if TYPE_INFERENCE_SAMPLE_ROWS is not None and df_rows > TYPE_INFERENCE_SAMPLE_ROWS: @@ -1921,10 +1934,14 @@ def main(argv: Optional[List[str]] = None) -> int: print(f"error: SAS file not found: {cfg.filename}", file=sys.stderr) return 2 - # Schema inference uses a bounded preview read so we never load a - # hundreds-of-millions-of-rows file into memory just to pick types. - # NB: ``meta.number_rows`` on a ``row_limit``-ed read reflects rows - # returned, not the file's total, so we don't trust it here. + # Schema inference reads the whole file so type + nullability are + # computed against every row. That's what the target host has the + # resources for and is the only way to honestly emit ``NOT NULL`` - + # a bounded preview routinely missed the ~0.2% of rows with nulls on + # otherwise-dense keys (e.g. MAFID). If you're on a box that can't + # fit the file in memory, override ``TYPE_INFERENCE_SAMPLE_ROWS`` to + # an integer cap and know that sampled specs may stamp ``NOT NULL`` + # on columns whose nulls live past the window. preview_df, meta = read_sas_preview(cfg.filename) preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude) columns = infer_schema(preview_df, meta)