2026-04-21 22:32:18 +00:00
1 changed files with 36 additions and 19 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -183,15 +183,17 @@ Priority order used by :func:`infer_schema`:
       value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise
       ``DOUBLE PRECISION``.

-Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for
-performance on large files. The CLI enforces this at read time via
-:func:`read_sas_preview`, so the whole file is never materialized just to pick
-types. Sampled specs carry an ``inferred_from_sample`` marker and the usual
-tradeoffs: if the first N rows fit ``INTEGER`` but a later row exceeds int32,
-or a column had no nulls in the preview but does later in the file, ``COPY``
-will fail mid-stream and the whole transaction rolls back. Set
-``TYPE_INFERENCE_SAMPLE_ROWS = None`` to scan every row when exact typing
-matters more than speed.
+By default (``TYPE_INFERENCE_SAMPLE_ROWS = None``) type inference scans the
+whole file, so both type and nullability are computed against every row.
+The CLI materializes the file once for schema inference, then re-streams it
+chunk by chunk into ``COPY``; peak memory is roughly one full dataframe.
+Set ``TYPE_INFERENCE_SAMPLE_ROWS`` to an integer cap if the host cannot hold
+the file in memory, but note that sampled specs carry the usual risks: a
+later row may exceed the inferred integer range, or a column with no nulls
+in the preview may contain nulls later in the file. In the latter case
+``COPY`` fails because the sampled spec declared the column ``NOT NULL``.
+This was seen in production on a 2.5M-row file with ~6k null MAFIDs past
+the 10k-row preview - the entire load aborted mid-stream.

 Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which
 commit each chunk as it is copied so an interrupted load retains the rows
@@ -255,12 +257,19 @@ values; too small a sample is easy to mis-infer."""
 NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647)
 """INTEGER bounds; anything outside becomes BIGINT."""

-TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000
+TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = None
 """Cap on rows inspected during per-column type inference. Also governs how
 many rows :func:`read_sas_preview` pulls from the file for dry-run / validate /
-schema-inference flows. Set to ``None`` to scan every row (and read the whole
-file into memory for the preview step - don't do this on multi-hundred-million
-row files)."""
+schema-inference flows.
+
+Default is ``None`` (scan every row, reading the whole file into memory for
+the schema-inference step). That's the only honest setting for nullability:
+any integer cap lets a column look ``NOT NULL`` across the first N rows
+while the file actually holds rare nulls past the window, which then
+detonates ``COPY`` mid-stream (seen in production on a 2.5M-row file where
+~6k MAFIDs were null past the 10k-row preview). If you're loading a file
+so large that a full read won't fit in memory, set this to an integer cap
+and accept that sampled specs can't be trusted for ``NOT NULL``."""

 DEFAULT_CHUNK_ROWS = 100_000
 """Rows per chunk when streaming a SAS file into ``COPY``. Larger values mean
@@ -777,8 +786,12 @@ def infer_schema(
    """
    original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})

-    # Row-walking type probes run on a bounded head slice; nullability and the
-    # all-null check still see every row so NOT NULL declarations stay honest.
+    # When ``TYPE_INFERENCE_SAMPLE_ROWS`` is an integer cap, row-walking type
+    # probes run on the head slice for speed; nullability and the all-null
+    # check still walk every row of ``df``. That's only honest when the
+    # caller handed us the full file, which the CLI does under the default
+    # ``None`` (no cap). Callers who pass a partial preview and a tight
+    # integer cap accept that ``NOT NULL`` can be wrong for rare-null columns.
    df_rows = len(df)
    effective_total = total_rows if total_rows is not None else df_rows
    if TYPE_INFERENCE_SAMPLE_ROWS is not None and df_rows > TYPE_INFERENCE_SAMPLE_ROWS:
@@ -1921,10 +1934,14 @@ def main(argv: Optional[List[str]] = None) -> int:
        print(f"error: SAS file not found: {cfg.filename}", file=sys.stderr)
        return 2

-    # Schema inference uses a bounded preview read so we never load a
-    # hundreds-of-millions-of-rows file into memory just to pick types.
-    # NB: ``meta.number_rows`` on a ``row_limit``-ed read reflects rows
-    # returned, not the file's total, so we don't trust it here.
+    # Schema inference reads the whole file so type and nullability are
+    # computed against every row. The target host has the resources for
+    # this, and it is the only way to honestly emit ``NOT NULL`` - a
+    # bounded preview routinely missed the ~0.2% of rows with nulls on
+    # otherwise-dense keys (e.g. MAFID). If you're on a box that can't
+    # fit the file in memory, override ``TYPE_INFERENCE_SAMPLE_ROWS`` to
+    # an integer cap and know that sampled specs may stamp ``NOT NULL``
+    # on columns whose nulls live past the window.
    preview_df, meta = read_sas_preview(cfg.filename)
    preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude)
    columns = infer_schema(preview_df, meta)