diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 74f5ff8..90a4615 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -183,15 +183,17 @@ Priority order used by :func:`infer_schema`: value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise ``DOUBLE PRECISION``. -Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for -performance on large files. The CLI enforces this at read time via -:func:`read_sas_preview`, so the whole file is never materialized just to pick -types. Sampled specs carry an ``inferred_from_sample`` marker and the usual -tradeoffs: if the first N rows fit ``INTEGER`` but a later row exceeds int32, -or a column had no nulls in the preview but does later in the file, ``COPY`` -will fail mid-stream and the whole transaction rolls back. Set -``TYPE_INFERENCE_SAMPLE_ROWS = None`` to scan every row when exact typing -matters more than speed. +Type inference scans the whole file by default (``TYPE_INFERENCE_SAMPLE_ROWS += None``) so type + nullability are both computed against every row. The CLI +materializes the file once for schema inference, then re-streams it chunk by +chunk into ``COPY``; peak memory is roughly one full dataframe. Override +``TYPE_INFERENCE_SAMPLE_ROWS`` to an integer cap if you're on a host that +can't hold the file in memory - but know that sampled specs carry the usual +risks: a later row may exceed the inferred integer range, or a column that +had no nulls in the preview may carry nulls later in the file (which then +detonates ``COPY`` because the sampled spec stamped it ``NOT NULL``). Seen +in production on a 2.5M-row file with ~6k null MAFIDs past the 10k-row +preview - the entire load aborted mid-stream. Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which commit each chunk as it is copied so an interrupted load retains the rows @@ -255,12 +257,19 @@ values; too small a sample is easy to mis-infer.""" NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647) """INTEGER bounds; anything outside becomes BIGINT.""" -TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000 +TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = None """Cap on rows inspected during per-column type inference. Also governs how many rows :func:`read_sas_preview` pulls from the file for dry-run / validate / -schema-inference flows. Set to ``None`` to scan every row (and read the whole -file into memory for the preview step - don't do this on multi-hundred-million -row files).""" +schema-inference flows. + +Default is ``None`` (scan every row, reading the whole file into memory for +the schema-inference step). That's the only honest setting for nullability: +any integer cap lets a column look ``NOT NULL`` across the first N rows +while the file actually holds rare nulls past the window, which then +detonates ``COPY`` mid-stream (seen in production on a 2.5M-row file where +~6k MAFIDs were null past the 10k-row preview). If you're loading a file +so large that a full read won't fit in memory, set this to an integer cap +and accept that sampled specs can't be trusted for ``NOT NULL``.""" DEFAULT_CHUNK_ROWS = 100_000 """Rows per chunk when streaming a SAS file into ``COPY``. Larger values mean @@ -777,8 +786,12 @@ def infer_schema( """ original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {}) - # Row-walking type probes run on a bounded head slice; nullability and the - # all-null check still see every row so NOT NULL declarations stay honest. + # When ``TYPE_INFERENCE_SAMPLE_ROWS`` is an integer cap, row-walking type + # probes run on the head slice for speed; nullability and the all-null + # check still walk every row of ``df``. That's only honest when the + # caller handed us the full file - with the default cap of ``None`` the + # CLI does exactly that. Callers who pass a partial preview and a tight + # integer cap accept that ``NOT NULL`` can be wrong for rare-null columns. df_rows = len(df) effective_total = total_rows if total_rows is not None else df_rows if TYPE_INFERENCE_SAMPLE_ROWS is not None and df_rows > TYPE_INFERENCE_SAMPLE_ROWS: @@ -1921,10 +1934,14 @@ def main(argv: Optional[List[str]] = None) -> int: print(f"error: SAS file not found: {cfg.filename}", file=sys.stderr) return 2 - # Schema inference uses a bounded preview read so we never load a - # hundreds-of-millions-of-rows file into memory just to pick types. - # NB: ``meta.number_rows`` on a ``row_limit``-ed read reflects rows - # returned, not the file's total, so we don't trust it here. + # Schema inference reads the whole file so type + nullability are + # computed against every row. That's what the target host has the + # resources for and is the only way to honestly emit ``NOT NULL`` - + # a bounded preview routinely missed the ~0.2% of rows with nulls on + # otherwise-dense keys (e.g. MAFID). If you're on a box that can't + # fit the file in memory, override ``TYPE_INFERENCE_SAMPLE_ROWS`` to + # an integer cap and know that sampled specs may stamp ``NOT NULL`` + # on columns whose nulls live past the window. preview_df, meta = read_sas_preview(cfg.filename) preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude) columns = infer_schema(preview_df, meta)