From 3a0537270c1607881dfc7224f917e4a647f4748d Mon Sep 17 00:00:00 2001
From: David Peterson <dpeterson@americafirstpolicy.com>
Date: Sat, 18 Apr 2026 10:28:37 -0500
Subject: [PATCH] Implement type inference sampling in load_sas.py to improve
 performance on large SAS files. Introduce TYPE_INFERENCE_SAMPLE_ROWS to limit
 the number of rows scanned for type detection while ensuring nullability
 checks cover the entire column. Update documentation to reflect these
 changes.

---
 generic_loader/load_sas.py | 44 ++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py
index 4956fb3..06c5156 100644
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -163,16 +163,26 @@ Priority order used by :func:`infer_schema`:
        value exceeds the int32 range ``NUMERIC_INT_RANGE``); otherwise
        ``DOUBLE PRECISION``.
 
+Type inference scans only the first ``TYPE_INFERENCE_SAMPLE_ROWS`` rows for
+performance on large files. Nullability and all-null detection still run over
+the full column (they're vectorized and fast) so a ``NOT NULL`` constraint is
+never declared for a column that has a null anywhere in the file. Tradeoff: a
+type that fits the sample may not fit later rows (e.g. the first N rows fit
+``INTEGER`` but a later row exceeds int32), in which case COPY will fail; raise
+the sample size or set ``TYPE_INFERENCE_SAMPLE_ROWS = None`` to scan every row.
+
 7. Tunables
 -----------
 Module-level knobs at the top of this file:
 
-    * ``COERCE_CHAR_COLUMNS``     - whether to promote stringly-typed numerics/
+    * ``COERCE_CHAR_COLUMNS``      - whether to promote stringly-typed numerics/
                                     dates (default True).
     * ``CHAR_INFERENCE_MIN_VALUES`` - minimum non-empty sample size before
                                     char-column coercion is attempted.
     * ``NUMERIC_INT_RANGE``        - INTEGER bounds; values outside become
                                     ``BIGINT``.
+    * ``TYPE_INFERENCE_SAMPLE_ROWS`` - cap on rows used for type inference
+                                    (default 10,000; ``None`` = scan all rows).
 """
 
 from __future__ import annotations
@@ -210,6 +220,15 @@ values; too small a sample is easy to mis-infer."""
 NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647)
 """INTEGER bounds; anything outside becomes BIGINT."""
 
+TYPE_INFERENCE_SAMPLE_ROWS: Optional[int] = 10_000
+"""Cap on rows inspected during per-column type inference. The row-walking
+helpers (date detection on object columns, string-coercion probes, whole-number
+check on numeric columns) operate on ``df.head(TYPE_INFERENCE_SAMPLE_ROWS)``
+instead of the full frame, which matters on SAS files with hundreds of millions
+of rows. Nullability is still evaluated across the whole column (cheap,
+vectorized) so ``NOT NULL`` declarations remain safe. Set to ``None`` to scan
+every row."""
+
 
 VALID_IF_EXISTS = ("fail", "replace", "append")
 
@@ -547,6 +566,16 @@ def infer_schema(
     """
     original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
 
+    # Row-walking type probes run on a bounded head slice; nullability and the
+    # all-null check still see every row so NOT NULL declarations stay honest.
+    total_rows = len(df)
+    if TYPE_INFERENCE_SAMPLE_ROWS is not None and total_rows > TYPE_INFERENCE_SAMPLE_ROWS:
+        sample_df = df.head(TYPE_INFERENCE_SAMPLE_ROWS)
+        sampled = True
+    else:
+        sample_df = df
+        sampled = False
+
     # Temporarily flip the module-level flag if the caller asked us to.
     global COERCE_CHAR_COLUMNS
     saved = COERCE_CHAR_COLUMNS
@@ -555,6 +584,7 @@ def infer_schema(
         out: Dict[str, ColumnSpec] = {}
         for col in df.columns:
             series = df[col]
+            sample_series = sample_df[col]
             sas_format = original_formats.get(col)
             notes: List[str] = []
 
@@ -567,13 +597,13 @@ def infer_schema(
                 elif pd.api.types.is_datetime64_any_dtype(series):
                     pg_type = "TIMESTAMP"
                 elif pd.api.types.is_object_dtype(series):
-                    is_dates, any_dt = _object_is_dates(series)
+                    is_dates, any_dt = _object_is_dates(sample_series)
                     if is_dates:
                         pg_type = "TIMESTAMP" if any_dt else "DATE"
                     else:
-                        pg_type = _infer_char_type(series)
+                        pg_type = _infer_char_type(sample_series)
                 elif pd.api.types.is_numeric_dtype(series):
-                    int_target = _numeric_int_target(series)
+                    int_target = _numeric_int_target(sample_series)
                     if int_target is not None:
                         pg_type = int_target
                     else:
@@ -582,6 +612,12 @@ def infer_schema(
                     pg_type = "TEXT"
                     notes.append(f"unhandled dtype {series.dtype}; defaulting to TEXT")
 
+                if sampled:
+                    notes.append(
+                        f"type inferred from first {TYPE_INFERENCE_SAMPLE_ROWS:,} of "
+                        f"{total_rows:,} rows"
+                    )
+
             nullable = _is_nullable(series)
 
             out[col] = ColumnSpec(