2026-04-21 22:32:18 +00:00
4 changed files with 446 additions and 29 deletions
--- a/generic_loader/load_folder.py
+++ b/generic_loader/load_folder.py
@ -32,6 +32,17 @@ USAGE
    # include: [ID, INTCOL]
    # exclude: [ALLNULL]
    # Optional folder default for explicit column type overrides. These
    # win over the cluster-wide auto-union computed during pre-scan; set
    # them when a column's SAS-level type varies across files (e.g. phone
    # IDs stored as CHAR in some years and NUM in others) and you want to
    # pin the Postgres type yourself rather than accept the auto-derived
    # one. Per-cluster column_types inside each clusters[*] entry are
    # merged on top of this map.
    # column_types:
    #   RESP_PH_PREFIX_ID: TEXT
    #   SOME_BIGINT_COL: BIGINT
    # Optional folder default for LIST partitioning. Omit or set [] for no
    # partitioning. Accepts a single string or a list of column names.
    # partition_by:
@ -43,14 +54,16 @@ USAGE
    # Optional explicit cluster patterns. Each pattern is matched against the
    # file *basename*. Matched files are pulled out of the auto-detect pool.
-    # Per-cluster if_exists/include/exclude/partition_by/max_partitions
+    # Per-cluster if_exists/include/exclude/partition_by/max_partitions/
-    # override the folder-level defaults.
+    # column_types override the folder-level defaults.
    clusters:
      - pattern: '^group_a\\d+\\.sas7bdat$'
        tablename: group_a
      - pattern: '^group_b\\d+\\.sas7bdat$'
        tablename: group_b
        if_exists: replace
        column_types:
          PHONE_PREFIX: TEXT
 2. Command-line interface
 -------------------------
@ -158,6 +171,7 @@ from load_sas import (
    create_indexes,
    create_table,
    discover_partition_values_chunked,
    extract_union_metadata,
    infer_schema,
    iter_sas_chunks,
    read_sas_metadata,
@ -165,6 +179,7 @@ from load_sas import (
    render_create_indexes,
    render_create_table,
    render_partition_ddl,
    union_column_types,
 )
@ -182,7 +197,11 @@ class ClusterSpec:
    ``partition_by``, ``max_partitions``, and ``indexes`` are resolved from
    the folder defaults and any per-cluster overrides during
-    :func:`discover_clusters`.
+    :func:`discover_clusters`. ``column_types`` holds the effective type
    overrides for this cluster: user-supplied YAML entries merged on top
    of the auto-union result computed during pre-scan (see :func:`main`).
    The same dict is threaded through to workers so every file in the
    cluster infers the same schema.
    """
    tablename: str
@ -195,6 +214,7 @@ class ClusterSpec:
    partition_by: List[str] = field(default_factory=list)
    max_partitions: int = 10_000
    indexes: List[str] = field(default_factory=list)
    column_types: Dict[str, str] = field(default_factory=dict)
@dataclass
@ -205,6 +225,8 @@ class _ExplicitPattern:
    An explicit empty list ``[]`` means "disable partitioning for this cluster".
    ``max_partitions`` defaults to ``None`` meaning "inherit from folder level".
    ``indexes`` defaults to ``None`` meaning "inherit from folder level".
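    ``column_types`` defaults to ``None`` meaning "inherit from folder level";
    an explicit ``{}`` adds no cluster-level entries, so the folder-level map
    still applies unchanged (unlike ``partition_by``, an empty value does not
    disable the folder default).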
    """
    pattern: re.Pattern
@ -216,6 +238,7 @@ class _ExplicitPattern:
    partition_by: Optional[List[str]] = None
    max_partitions: Optional[int] = None
    indexes: Optional[List[str]] = None
    column_types: Optional[Dict[str, str]] = None
@dataclass
@ -224,6 +247,9 @@ class FolderConfig:
    ``partition_by``, ``max_partitions``, and ``indexes`` serve as defaults
    for every cluster unless overridden at the cluster level.
    ``column_types`` is a ``{column_name: postgres_type_str}`` map of
    user-supplied type overrides that win over the auto-union computed
    during pre-scan.
    """
    folder: Path
@ -236,6 +262,7 @@ class FolderConfig:
    partition_by: List[str] = field(default_factory=list)
    max_partitions: int = 10_000
    indexes: List[str] = field(default_factory=list)
    column_types: Dict[str, str] = field(default_factory=dict)
 # ---------------------------------------------------------------------------
@ -396,6 +423,40 @@ def _validate_indexes_vs_columns(
        )
 def _parse_column_types(
    raw_value: Any, where: str, *, allow_none: bool = False
 ) -> Optional[Dict[str, str]]:
    """Parse a ``column_types`` mapping from YAML.
    The value must be a mapping ``{column_name: pg_type_str}``. Keys and
    values are whitespace-stripped strings; empty strings raise. When
    ``allow_none`` is True (used for per-cluster entries), an omitted key
    returns ``None`` to mean "inherit from folder level"; an explicit
    empty mapping returns ``{}`` (no overrides for this cluster).
    """
    if raw_value is None:
        return None if allow_none else {}
    if not isinstance(raw_value, dict):
        raise ValueError(
            f"{where}: 'column_types' must be a mapping of "
            f"{{column_name: postgres_type}}."
        )
    out: Dict[str, str] = {}
    for k, v in raw_value.items():
        key = str(k).strip()
        if not key:
            raise ValueError(
                f"{where}: 'column_types' contains an empty column name."
            )
        if not isinstance(v, str) or not v.strip():
            raise ValueError(
                f"{where}: 'column_types[{key}]' must be a non-empty "
                f"Postgres type string (got {v!r})."
            )
        out[key] = v.strip()
    return out
 def load_folder_config(path: Path) -> FolderConfig:
    """Parse and validate the folder-level YAML config at ``path``.
@ -438,6 +499,11 @@ def load_folder_config(path: Path) -> FolderConfig:
    indexes = _parse_indexes(raw.get("indexes"), f"Config {path}")
    _validate_indexes_vs_columns(indexes, exclude, f"Config {path}")
    # -- folder-level column_types overrides --------------------------------
    column_types = _parse_column_types(
        raw.get("column_types"), f"Config {path}"
    )
    explicit: List[_ExplicitPattern] = []
    clusters_raw = raw.get("clusters") or []
    if not isinstance(clusters_raw, list):
@ -479,6 +545,11 @@ def load_folder_config(path: Path) -> FolderConfig:
        effective_idx = c_indexes if c_indexes is not None else indexes
        _validate_indexes_vs_columns(effective_idx, effective_exclude, where)
        # -- per-cluster column_types overrides -----------------------------
        c_column_types = _parse_column_types(
            entry.get("column_types"), where, allow_none=True
        )
        explicit.append(
            _ExplicitPattern(
                pattern=compiled,
@ -490,6 +561,7 @@ def load_folder_config(path: Path) -> FolderConfig:
                partition_by=c_partition_by,
                max_partitions=c_max_partitions,
                indexes=c_indexes,
                column_types=c_column_types,
            )
        )
@ -504,6 +576,7 @@ def load_folder_config(path: Path) -> FolderConfig:
        partition_by=partition_by,
        max_partitions=max_partitions,
        indexes=indexes,
        column_types=column_types or {},
    )
@ -601,6 +674,14 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
            patt.indexes if patt.indexes is not None
            else cfg.indexes
        )
        # Resolve column_types: user overrides only. The auto-union adds
        # more entries later (in :func:`main`) after the metadata pre-scan.
        # None = inherit folder, {} = no cluster-level overrides, dict =
        # cluster-level overrides that win over folder-level entries.
        if patt.column_types is None:
            resolved_ct: Dict[str, str] = dict(cfg.column_types)
        else:
            resolved_ct = {**cfg.column_types, **patt.column_types}
        matched = [f for f in remaining if patt.pattern.search(f.name)]
        if not matched:
@ -618,6 +699,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
                    partition_by=resolved_pb,
                    max_partitions=resolved_mp,
                    indexes=resolved_idx,
                    column_types=dict(resolved_ct),
                )
            )
            continue
@ -634,6 +716,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
                partition_by=resolved_pb,
                max_partitions=resolved_mp,
                indexes=resolved_idx,
                column_types=dict(resolved_ct),
            )
        )
@ -654,6 +737,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
                    partition_by=cfg.partition_by,
                    max_partitions=cfg.max_partitions,
                    indexes=cfg.indexes,
                    column_types=dict(cfg.column_types),
                )
            )
@ -666,19 +750,29 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
 def _infer_cluster_schema(
-    path: Path, include, exclude
+    path: Path,
    include,
    exclude,
    *,
    column_types: Optional[Dict[str, str]] = None,
 ) -> Tuple[Dict, Optional[int]]:
    """Infer the Postgres column schema from a SAS file preview.
    Returns ``(columns, total_rows)``. ``total_rows`` comes from the
    pyreadstat metadata (the file's declared row count) and is threaded
    through to :func:`_stream_file` so the tqdm progress bar has a real
-    denominator instead of an indeterminate spinner.
+    denominator instead of an indeterminate spinner. ``column_types``
    lets the caller pin specific columns to a chosen Postgres type
    (typically the merged auto-union + YAML overrides for the cluster).
    """
    preview_df, meta = read_sas_preview(path)
    preview_df = apply_column_filter(preview_df, include, exclude)
    total_rows = getattr(meta, "number_rows", None)
-    columns = infer_schema(preview_df, meta, total_rows=total_rows)
+    columns = infer_schema(
        preview_df, meta,
        total_rows=total_rows,
        column_types=column_types,
    )
    return columns, total_rows
@ -748,7 +842,8 @@ def load_cluster(
    first, *rest = cluster.files
    first_columns, first_total_rows = _infer_cluster_schema(
-        first, cluster.include, cluster.exclude
+        first, cluster.include, cluster.exclude,
        column_types=cluster.column_types,
    )
    # -- Validate index columns early ---------------------------------------
@ -827,6 +922,7 @@ def load_cluster(
            workers=workers,
            progress_queue=progress_queue,
            db_overrides=db_overrides,
            column_types=cluster.column_types,
        )
    else:
        # Serial path: stream the first file on the main connection, then
@ -842,7 +938,8 @@ def load_cluster(
        conn.commit()
        for path in rest:
            columns, path_total_rows = _infer_cluster_schema(
-                path, cluster.include, cluster.exclude
+                path, cluster.include, cluster.exclude,
                column_types=cluster.column_types,
            )
            # Uses the same check that if_exists=append runs. A type
            # mismatch or missing column aborts the cluster; because
@ -926,6 +1023,7 @@ def _worker_load_append_file(
    exclude: Optional[List[str]],
    progress_queue: Any,
    db_overrides: Optional[Dict[str, Optional[str]]],
    column_types: Optional[Dict[str, str]] = None,
 ) -> Tuple[str, int, Optional[str]]:
    """Worker process: load one SAS file in append mode.
@ -965,7 +1063,11 @@ def _worker_load_append_file(
        preview_df, meta = _read_sas_preview(path)
        preview_df = _apply_column_filter(preview_df, include, exclude)
        total_rows = getattr(meta, "number_rows", None)
-        columns = _infer_schema(preview_df, meta, total_rows=total_rows)
+        columns = _infer_schema(
            preview_df, meta,
            total_rows=total_rows,
            column_types=column_types,
        )
        # Drop the preview ASAP - on a 2M-row wide file it's hundreds of MB
        # and we never need it again after schema inference.
        del preview_df, meta
@ -1031,6 +1133,7 @@ def _load_remaining_files_parallel(
    workers: int,
    progress_queue: Any,
    db_overrides: Optional[Dict[str, Optional[str]]],
    column_types: Optional[Dict[str, str]] = None,
 ) -> int:
    """Run append-mode loads for ``files`` across a process pool.
@ -1069,6 +1172,7 @@ def _load_remaining_files_parallel(
                exclude,
                progress_queue,
                db_overrides,
                column_types,
            )
            for p in files
        ]
@ -1219,7 +1323,14 @@ def main(argv: Optional[List[str]] = None) -> int:
        print()
        for c in loadable:
            print(f"--- DDL for cluster {c.tablename!r} ---")
-            columns, _ = _infer_cluster_schema(c.files[0], c.include, c.exclude)
+            # Dry-run skips the pre-scan (so no auto-union) but user-supplied
            # ``column_types`` from YAML are already baked into ``c.column_types``
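            # by ``discover_clusters`` - honor them here so the previewed DDL
            # reflects those pins. Auto-union types are NOT applied here, so a
            # real load (without ``--no-prescan``) may still pick different
            # types for columns the YAML does not pin.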
            columns, _ = _infer_cluster_schema(
                c.files[0], c.include, c.exclude,
                column_types=c.column_types,
            )
            # Print parent CREATE TABLE (with PARTITION BY if applicable).
            print(
                render_create_table(
@ -1332,40 +1443,58 @@ def main(argv: Optional[List[str]] = None) -> int:
    # -- Metadata pre-scan -----------------------------------------------------
    # Sum ``number_rows`` across every file so the tqdm bar has a real
-    # denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
+    # denominator, AND collect the per-column (readstat_type, sas_format)
-    # fast path, but on multi-GB sas7bdat files that still reads tens of MB
+    # tuples so we can union schemas across files in a cluster before any
-    # of scattered subheader pages per file - sequentially that's minutes for
+    # CREATE TABLE runs. ``read_sas_metadata`` uses pyreadstat's
-    # a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
+    # ``metadataonly=True`` fast path, but on multi-GB sas7bdat files
-    # so a ThreadPool gives near-linear scaling until the disk saturates.
+    # that still reads tens of MB of scattered subheader pages per file -
-    # ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
+    # sequentially that's minutes for a 52-file folder. pyreadstat
-    # without an ETA - useful when pre-scan itself is expensive (half hour+
+    # releases the GIL during I/O and C decoding, so a ThreadPool gives
-    # on very large files) or when debugging iteratively.
+    # near-linear scaling until the disk saturates. ``--no-prescan``
    # bypasses the scan entirely; the progress bar then runs without an
    # ETA *and* the auto-union is skipped (user overrides from YAML
    # still apply).
    all_files: List[Path] = [p for c in loadable for p in c.files]
    grand_total: Optional[int] = 0
    file_meta_by_path: Dict[str, Dict[str, Tuple[str, Optional[str]]]] = {}
    if args.no_prescan:
        grand_total = None
        print(
            f"[info] --no-prescan set: skipping row-count pre-scan for "
            f"{len(all_files)} file(s); progress bar will show rate + "
-            f"elapsed but no ETA.",
+            f"elapsed but no ETA. Cluster-wide schema auto-union is also "
            f"disabled; only user-specified column_types overrides apply.",
            file=sys.stderr,
        )
    else:
        prescan_workers = min(16, max(1, len(all_files)))
        print(
-            f"pre-scanning row counts for {len(all_files)} file(s) "
+            f"pre-scanning row counts + per-column metadata for "
-            f"across {prescan_workers} thread(s)...",
+            f"{len(all_files)} file(s) across {prescan_workers} thread(s)...",
            file=sys.stderr,
        )
-        def _scan_one(p: Path) -> Tuple[Path, Optional[int], Optional[str]]:
+        def _scan_one(
            p: Path,
        ) -> Tuple[
            Path,
            Optional[int],
            Optional[Dict[str, Tuple[str, Optional[str]]]],
            Optional[str],
        ]:
            try:
                meta = read_sas_metadata(p)
                n = getattr(meta, "number_rows", None)
-                return (p, int(n) if n is not None else None, None)
+                col_meta = extract_union_metadata(meta)
                return (
                    p,
                    int(n) if n is not None else None,
                    col_meta,
                    None,
                )
            except Exception as e:
-                return (p, None, str(e))
+                return (p, None, None, str(e))
        unknown_total_files: List[str] = []
        running_total = 0
@ -1378,7 +1507,7 @@ def main(argv: Optional[List[str]] = None) -> int:
                dynamic_ncols=True,
            )
            try:
-                for p, n, err in tpool.map(_scan_one, all_files):
+                for p, n, col_meta, err in tpool.map(_scan_one, all_files):
                    prescan_bar.update(1)
                    if err is not None:
                        unknown_total_files.append(f"{p.name} ({err})")
@ -1386,6 +1515,8 @@ def main(argv: Optional[List[str]] = None) -> int:
                        unknown_total_files.append(p.name)
                    else:
                        running_total += n
                    if col_meta is not None:
                        file_meta_by_path[str(p)] = col_meta
            finally:
                prescan_bar.close()
@ -1402,6 +1533,59 @@ def main(argv: Optional[List[str]] = None) -> int:
        )
        grand_total = running_total
    # -- Cluster-wide schema auto-union ---------------------------------------
    # For each cluster, compute ``auto_types`` from the union of every
    # file's metadata (see :func:`load_sas.union_column_types`). Merge with
    # any user-supplied YAML overrides (user wins) and attach the result
    # back onto the cluster so every later read - first-file inference,
    # worker inference, schema-compat check - sees the same frozen schema.
    # With ``--no-prescan`` the file_meta_by_path dict is empty and
    # ``auto_types`` resolves to {}, so only the YAML overrides survive.
    for c in loadable:
        per_file = [
            file_meta_by_path[str(p)]
            for p in c.files
            if str(p) in file_meta_by_path
        ]
        auto_types = union_column_types(per_file) if per_file else {}
        user_overrides = dict(c.column_types)  # already merged folder+cluster
        # User-supplied overrides win over the auto-union.
        merged = {**auto_types, **user_overrides}
        c.column_types = merged
        if auto_types:
            # Only call out columns where auto-union *changed* something
            # relative to the default "first file wins" inference. We
            # don't have the default inference in hand at this point, so
            # log the full resolved map at a debug-friendly level - it's
            # bounded by column count and the user asked for visibility
            # into what got overridden.
            shown = auto_types
            if user_overrides:
                # Distinguish the user-forced entries in the log so it's
                # obvious which types came from YAML.
                shown = {
                    col: (
                        f"{user_overrides[col]} (user override)"
                        if col in user_overrides
                        else pg
                    )
                    for col, pg in merged.items()
                }
            print(
                f"[info] cluster {c.tablename!r}: auto-union derived "
                f"{len(auto_types)} column type(s) across "
                f"{len(per_file)} file(s): {shown}",
                file=sys.stderr,
            )
        elif user_overrides and args.no_prescan:
            print(
                f"[info] cluster {c.tablename!r}: using {len(user_overrides)} "
                f"user-supplied column_types override(s); auto-union "
                f"disabled by --no-prescan.",
                file=sys.stderr,
            )
    # -- Shared progress plumbing ---------------------------------------------
    # The queue crosses process boundaries when workers > 1 (managed proxy)
    # and is a plain in-process queue otherwise; the put/get contract is
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@ -307,6 +307,7 @@ class LoaderConfig:
    partition_by: List[str] = field(default_factory=list)
    max_partitions: int = 10_000
    indexes: List[str] = field(default_factory=list)
    column_types: Dict[str, str] = field(default_factory=dict)
@dataclass
@ -517,6 +518,35 @@ def load_config(path: Path) -> LoaderConfig:
                    f"{missing_in_include}"
                )
    # -- column_types -------------------------------------------------------
    # Optional ``{column_name: pg_type}`` escape hatch that bypasses
    # automatic type inference for specific columns. Useful when
    # pyreadstat reports a column as NUM but the downstream consumer
    # expects TEXT (e.g. phone-id columns), or when a column has drifted
    # between CHAR and NUM across file versions and you want to pin
    # TEXT up front. See also :func:`infer_schema`.
    raw_ct = raw.get("column_types")
    column_types: Dict[str, str] = {}
    if raw_ct is not None:
        if not isinstance(raw_ct, dict):
            raise ValueError(
                f"Config {path}: 'column_types' must be a mapping of "
                f"{{column_name: postgres_type}}."
            )
        for k, v in raw_ct.items():
            key = str(k).strip()
            if not key:
                raise ValueError(
                    f"Config {path}: 'column_types' contains an empty "
                    f"column name."
                )
            if not isinstance(v, str) or not v.strip():
                raise ValueError(
                    f"Config {path}: 'column_types[{key}]' must be a "
                    f"non-empty Postgres type string (got {v!r})."
                )
            column_types[key] = v.strip()
    return LoaderConfig(
        filename=filename,
        schemaname=schemaname,
@ -527,6 +557,7 @@ def load_config(path: Path) -> LoaderConfig:
        partition_by=partition_by,
        max_partitions=max_partitions,
        indexes=indexes,
        column_types=column_types,
    )
@ -687,6 +718,117 @@ def _format_driven_type(sas_format: Optional[str]) -> Optional[str]:
    return None
 _DECIMAL_FORMAT_RE = re.compile(r"\.(\d+)")
 def _format_hints_decimal(sas_format: Optional[str]) -> bool:
    """True if a numeric SAS format string explicitly carries decimal places.
    SAS numeric formats are ``NAMEw.d``; ``d > 0`` means the variable was
    intended to render with ``d`` decimal digits (COMMA10.2, F8.3, ...).
    A bare width like ``BEST12.`` or ``F8.`` has no digits after the dot
    and is treated as integer-presenting. Used by
    :func:`union_column_types` to pick BIGINT vs DOUBLE PRECISION when a
    column is numeric in every file of a cluster.
    """
    if not sas_format:
        return False
    m = _DECIMAL_FORMAT_RE.search(sas_format)
    if not m:
        return False
    try:
        return int(m.group(1)) > 0
    except ValueError:
        return False
 def extract_union_metadata(
    meta: Any,
 ) -> Dict[str, Tuple[str, Optional[str]]]:
    """Pull the (readstat_type, sas_format) pair for every column in ``meta``.
    Returns a plain ``{column_name: (readstat_type, sas_format)}`` dict
    built only from strings, tuples and ``None``, suitable as input to
    :func:`union_column_types`. ``readstat_type`` is the simplified type
    reported by pyreadstat in ``meta.readstat_variable_types``
    (``"string"`` for SAS CHAR, ``"double"`` for SAS NUM); it is ``""``
    when the metadata carries no type for the column. ``sas_format`` comes
    from ``meta.original_variable_types`` and is ``None`` when missing or
    empty.
    Column order follows ``meta.column_names`` when present, otherwise the
    keys of the type map, otherwise the keys of the format map.
    """
    # pyreadstat publishes the simplified storage types under
    # ``readstat_variable_types``. ``variable_types`` is kept as a fallback
    # so metadata-like objects using that attribute name keep working.
    var_types = dict(
        getattr(meta, "readstat_variable_types", None)
        or getattr(meta, "variable_types", None)
        or {}
    )
    formats = dict(getattr(meta, "original_variable_types", None) or {})
    names = list(
        getattr(meta, "column_names", None)
        or var_types
        or formats
    )
    out: Dict[str, Tuple[str, Optional[str]]] = {}
    for col in names:
        # ``or ""`` keeps a missing/None type as "" instead of the string
        # "None", which downstream would be mistaken for a CHAR column.
        rtype = str(var_types.get(col) or "")
        fmt = formats.get(col)
        out[col] = (rtype, fmt if fmt else None)
    return out
 def union_column_types(
    per_file_metas: Iterable[Dict[str, Tuple[str, Optional[str]]]],
 ) -> Dict[str, str]:
    """Derive one Postgres type per column that's safe across every file.
    ``per_file_metas`` is an iterable (one entry per file in a cluster) of
    ``{column_name: (readstat_type, sas_format)}`` dicts as produced by
    :func:`extract_union_metadata`. Returns ``{column_name: pg_type_str}``.
    Rules, evaluated per column:
    * **Unknown type everywhere → omitted.** If no file reports a
      readstat type for the column (every entry is ``""``), the column is
      left out of the result so the caller's normal inference decides,
      rather than pinning a guessed numeric type onto what may be CHAR.
    * **CHAR/NUM drift wins TEXT.** If any file reports a known type other
      than ``"double"`` (compared case-insensitively) the union is
      ``TEXT``. This covers the phone-id case where some years stored
      ``RESP_PH_PREFIX_ID`` as CHAR and others as NUM.
    * **All NUM, format hints DATETIME → TIMESTAMP.** Any file whose
      format resolves to ``TIMESTAMP`` (via :func:`_format_driven_type`)
      pins the column to ``TIMESTAMP`` even if other files left the
      format blank.
    * **All NUM, format hints DATE → DATE.** Same idea for date-only
      formats.
    * **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
      with ``d > 0`` in any file implies fractional values somewhere.
    * **All NUM, otherwise → BIGINT.** Default to BIGINT per user
      preference: integer-presenting NUM columns drift between
      INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are
      worth not re-failing every load.
    Columns missing from a given file are simply skipped for that file;
    the union is computed over whichever files *did* supply the column.
    Columns that never appear anywhere are omitted from the result.
    """
    # Group the (type, format) pairs by column across all files.
    per_col: Dict[str, List[Tuple[str, Optional[str]]]] = {}
    for meta in per_file_metas:
        for col, pair in meta.items():
            per_col.setdefault(col, []).append(pair)
    result: Dict[str, str] = {}
    for col, entries in per_col.items():
        known_types = [rtype.lower() for rtype, _ in entries if rtype]
        if not known_types:
            # No file told us how this column is stored; don't guess.
            continue
        if any(rtype != "double" for rtype in known_types):
            result[col] = "TEXT"
            continue
        # Every known type is NUM: let the SAS formats pick the flavour.
        formats = [fmt for _, fmt in entries if fmt]
        driven = [_format_driven_type(f) for f in formats]
        if "TIMESTAMP" in driven:
            result[col] = "TIMESTAMP"
        elif "DATE" in driven:
            result[col] = "DATE"
        elif any(_format_hints_decimal(f) for f in formats):
            result[col] = "DOUBLE PRECISION"
        else:
            # Safe default: BIGINT. The user explicitly accepted wasting a
            # few bytes here to avoid INTEGER→BIGINT widening failures on
            # multi-year clusters.
            result[col] = "BIGINT"
    return result
 def _all_null(series: pd.Series) -> bool:
    if pd.api.types.is_object_dtype(series):
        return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
@ -812,6 +954,7 @@ def infer_schema(
    *,
    coerce_chars: bool = COERCE_CHAR_COLUMNS,
    total_rows: Optional[int] = None,
    column_types: Optional[Dict[str, str]] = None,
 ) -> Dict[str, ColumnSpec]:
    """Infer a Postgres column spec for each column in ``df``.
@ -827,6 +970,14 @@ def infer_schema(
    ``total_rows`` lets callers who already sampled the frame (e.g. via
    :func:`read_sas_preview`) report the real file size in the per-column
    "inferred from first N of M rows" note. Falls back to ``len(df)``.
    ``column_types`` is an optional map ``{column_name: pg_type_str}``
    whose entries bypass inference entirely - the caller has already
    decided the type (e.g. via :func:`union_column_types` across a
    cluster, or a YAML ``column_types`` override). Nullability is still
    computed from the data. Columns in ``column_types`` that don't exist
    in ``df`` are ignored so a shared override dict can apply to clusters
    with different column sets.
    """
    original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
@ -846,6 +997,8 @@ def infer_schema(
        sample_size = df_rows
    sampled = sample_size < effective_total
    overrides: Dict[str, str] = dict(column_types or {})
    # Temporarily flip the module-level flag if the caller asked us to.
    global COERCE_CHAR_COLUMNS
    saved = COERCE_CHAR_COLUMNS
@ -858,6 +1011,23 @@ def infer_schema(
            sas_format = original_formats.get(col)
            notes: List[str] = []
            if col in overrides:
                pg_type = overrides[col]
                notes.append(
                    f"type forced to {pg_type} via column_types override"
                )
                nullable = _is_nullable(series)
                out[col] = ColumnSpec(
                    name=col,
                    postgres_type=pg_type,
                    nullable=nullable,
                    sas_format=sas_format,
                    source_dtype=str(series.dtype),
                    notes=notes,
                    sampled=sampled,
                )
                continue
            pg_type = _format_driven_type(sas_format)
            if pg_type is None:
@ -1832,7 +2002,33 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
            # astype(str) stringifies NaN/None to the literal "nan"/"None",
            # so we mask those after the fact rather than branching per cell.
            na_mask = series.isna()
-            out[name] = series.astype(str).mask(na_mask, "")
+            if pd.api.types.is_numeric_dtype(series):
                # Hit when a column was auto-unioned to TEXT because at
                # least one file of the cluster stored it as CHAR but this
                # particular file stored it as NUM (typical of SAS phone-id
                # columns). Default float formatting would emit "123.0" -
                # which doesn't match the plain "123" coming from the CHAR
                # files. When the whole chunk is integer-valued, round to
                # int before stringifying; when any fractional value is
                # present we leave float formatting alone so we don't
                # silently drop precision.
                nonnull = series.dropna()
                int_like = False
                if not nonnull.empty:
                    try:
                        int_like = bool(((nonnull % 1) == 0).all())
                    except TypeError:
                        int_like = False
                if int_like:
                    # ``Int64`` preserves NA; ``.astype(str)`` renders NA
                    # as '<NA>', which we then mask out alongside original
                    # NaNs.
                    as_str = series.astype("Int64").astype(str)
                    out[name] = as_str.mask(na_mask, "")
                else:
                    out[name] = series.astype(str).mask(na_mask, "")
            else:
                out[name] = series.astype(str).mask(na_mask, "")
        elif pg == "BOOLEAN":
            out[name] = series.astype("boolean") if series.dtype != object else series
        else:
@ -2064,7 +2260,7 @@ def main(argv: Optional[List[str]] = None) -> int:
    # on columns whose nulls live past the window.
    preview_df, meta = read_sas_preview(cfg.filename)
    preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude)
-    columns = infer_schema(preview_df, meta)
+    columns = infer_schema(preview_df, meta, column_types=cfg.column_types)
    # Validate partition columns exist in the schema after filtering.
    if cfg.partition_by:
--- a/generic_loader/sample_config.yaml
+++ b/generic_loader/sample_config.yaml
@ -38,3 +38,15 @@ if_exists: append
 #   indexes:
 #     - state
 #     - zip
 # column_types: Explicit {column_name: postgres_type} overrides that
 # bypass automatic type inference for the listed columns. Useful when
 # pyreadstat reports a column as NUM but you want it stored as TEXT
 # (phone/ID columns that are conceptually strings), or when a column's
 # inferred type is off for any other reason. Columns not listed here
 # fall through to the normal inference path. Nullability is always
 # computed from the data.
 #
 # column_types:
 #   RESP_PH_PREFIX_ID: TEXT
 #   SOMELONG_ID: BIGINT
--- a/generic_loader/sample_folder_config.yaml
+++ b/generic_loader/sample_folder_config.yaml
@ -61,15 +61,40 @@ auto_detect: true
 #     - state
 #     - zip
 # Folder-level column_types: Explicit {column_name: postgres_type} map that
 # bypasses automatic type inference for the listed columns. Applied to
 # every cluster; a cluster's own column_types are merged on top of this
 # map (cluster entries win on conflict).
 #
 # Unless --no-prescan is given, the pre-scan derives a cluster-wide "auto-union"
 # type per column (e.g. any file stores the column as CHAR -> TEXT; all
 # NUM with any format hinting decimals -> DOUBLE PRECISION; otherwise
 # BIGINT). Entries in column_types here win over that auto-union - use
 # them when the auto result is wrong or when --no-prescan disables the
 # auto-union and you still need to pin a column.
 #
 # Valid type strings are anything the CREATE TABLE DDL accepts (TEXT,
 # INTEGER, BIGINT, DOUBLE PRECISION, DATE, TIMESTAMP, ...). Columns that
 # don't exist in a given file are simply ignored for that file.
 #
 # column_types:
 #   RESP_PH_PREFIX_ID: TEXT
 #   RESP_PH_SUFFIX_ID: TEXT
 #   SOMELONG_ID: BIGINT
 # Explicit cluster patterns. Each pattern is matched against the file
 # *basename*. Files matched by a pattern are pulled out of the auto-detect
 # pool, so explicit and auto clusters compose cleanly.
 #
-# `tablename` is required. `if_exists`, `include`, and `exclude` are
+# `tablename` is required. `if_exists`, `include`, `exclude`, and
-# optional per-cluster overrides of the folder-level defaults above.
+# `column_types` are optional per-cluster overrides of the folder-level
 # defaults above. Cluster-level column_types entries win over folder-
 # level entries for the same column.
 clusters:
  - pattern: '^group_a\d+\.xpt$'
    tablename: group_a
    # column_types:
    #   INTCOL: TEXT
  # Example of an explicit override. Uncomment to force the group_b cluster to
  # append instead of replace even though the folder default is "replace":