From 3a0537270c1607881dfc7224f917e4a647f4748d Mon Sep 17 00:00:00 2001 From: David Peterson Date: Sat, 18 Apr 2026 10:28:37 -0500 Subject: [PATCH] Implement type inference sampling in load_sas.py to improve performance on large SAS files. Introduce TYPE_INFERENCE_SAMPLE_ROWS to limit the number of rows scanned for type detection while ensuring nullability checks cover the entire column. Update documentation to reflect these changes. --- generic_loader/load_sas.py | 44 ++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py index 4956fb3..06c5156 100644 --- a/generic_loader/load_sas.py +++ b/generic_loader/load_sas.py @@ -163,16 +163,26 @@ Priority order used by :func:`infer_schema`: value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise ``DOUBLE PRECISION``. +Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for +performance on large files. Nullability and all-null detection still run over +the full column (they're vectorized and fast) so a ``NOT NULL`` constraint is +never declared for a column that has a null anywhere in the file. Tradeoff: +if the first N rows fit ``INTEGER`` but a later row exceeds int32, COPY will +fail; bump the sample size or set ``TYPE_INFERENCE_SAMPLE_ROWS = None`` to +scan the whole column. + 7. Tunables ----------- Module-level knobs at the top of this file: - * ``COERCE_CHAR_COLUMNS`` - whether to promote stringly-typed numerics/ + * ``COERCE_CHAR_COLUMNS`` - whether to promote stringly-typed numerics/ dates (default True). * ``CHAR_INFERENCE_MIN_VALUES`` - minimum non-empty sample size before char-column coercion is attempted. * ``NUMERIC_INT_RANGE`` - INTEGER bounds; values outside become ``BIGINT``. + * ``TYPE_INFERENCE_SAMPLE_ROWS`` - cap on rows used for type inference + (``None`` = scan the whole column). """ from __future__ import annotations @@ -210,6 +220,15 @@ values; too small a sample is easy to mis-infer.""" NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647) """INTEGER bounds; anything outside becomes BIGINT.""" +TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000 +"""Cap on rows inspected during per-column type inference. The row-walking +helpers (date detection on object columns, string-coercion probes, whole-number +check on numeric columns) operate on ``df.head(TYPE_INFERENCE_SAMPLE_ROWS)`` +instead of the full frame, which matters on SAS files with hundreds of millions +of rows. Nullability is still evaluated across the whole column (cheap, +vectorized) so ``NOT NULL`` declarations remain safe. Set to ``None`` to scan +every row.""" + VALID_IF_EXISTS = ("fail", "replace", "append") @@ -547,6 +566,16 @@ def infer_schema( """ original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {}) + # Row-walking type probes run on a bounded head slice; nullability and the + # all-null check still see every row so NOT NULL declarations stay honest. + total_rows = len(df) + if TYPE_INFERENCE_SAMPLE_ROWS is not None and total_rows > TYPE_INFERENCE_SAMPLE_ROWS: + sample_df = df.head(TYPE_INFERENCE_SAMPLE_ROWS) + sampled = True + else: + sample_df = df + sampled = False + # Temporarily flip the module-level flag if the caller asked us to. global COERCE_CHAR_COLUMNS saved = COERCE_CHAR_COLUMNS @@ -555,6 +584,7 @@ def infer_schema( out: Dict[str, ColumnSpec] = {} for col in df.columns: series = df[col] + sample_series = sample_df[col] sas_format = original_formats.get(col) notes: List[str] = [] @@ -567,13 +597,13 @@ def infer_schema( elif pd.api.types.is_datetime64_any_dtype(series): pg_type = "TIMESTAMP" elif pd.api.types.is_object_dtype(series): - is_dates, any_dt = _object_is_dates(series) + is_dates, any_dt = _object_is_dates(sample_series) if is_dates: pg_type = "TIMESTAMP" if any_dt else "DATE" else: - pg_type = _infer_char_type(series) + pg_type = _infer_char_type(sample_series) elif pd.api.types.is_numeric_dtype(series): - int_target = _numeric_int_target(series) + int_target = _numeric_int_target(sample_series) if int_target is not None: pg_type = int_target else: @@ -582,6 +612,12 @@ def infer_schema( pg_type = "TEXT" notes.append(f"unhandled dtype {series.dtype}; defaulting to TEXT") + if sampled: + notes.append( + f"type inferred from first {TYPE_INFERENCE_SAMPLE_ROWS:,} of " + f"{total_rows:,} rows" + ) + nullable = _is_nullable(series) out[col] = ColumnSpec(