Adding generic loader

2026-04-18 09:34:48 -05:00 · 2026-04-18 09:34:48 -05:00 · f681f1012a
commit f681f1012a
6 changed files with 1315 additions and 0 deletions
--- a/generic_loader/.env.example
+++ b/generic_loader/.env.example
@ -0,0 +1,5 @@
+PGHOST=localhost
+PGPORT=5432
+PGUSER=
+PGPASSWORD=
+PGDATABASE=
--- a/generic_loader/.gitignore
+++ b/generic_loader/.gitignore
@ -0,0 +1,3 @@
+/.venv
+/samples
+/.env
--- a/generic_loader/generate_sample_sas.py
+++ b/generic_loader/generate_sample_sas.py
@ -0,0 +1,380 @@
+"""Generate a kitchen-sink SAS XPORT file plus an expected-types manifest.
+
+Running this script produces two files under samples/:
+  - sample_kitchensink.xpt           the SAS XPORT test fixture
+  - sample_kitchensink.expected.json ground-truth Postgres types for the loader
+
+Tune behavior via the top-level constants below.
+"""
+
+from __future__ import annotations
+
+import datetime as dt
+import json
+import string
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pyreadstat
+
+SEED = 42
+N_ROWS = 1000
+NULL_FRACTION = 0.20
+OUT_DIR = Path("samples")
+OUT_PATH = OUT_DIR / "sample_kitchensink.xpt"
+MANIFEST_PATH = OUT_DIR / "sample_kitchensink.expected.json"
+
+POSITIVE_CONTROLS = {"ID", "INTCOL", "STRCOL", "DATECOL", "CONST"}
+ALL_NULL_COLS = {"ALLNULL", "ALLNULLC"}
+
+
+def _missing_mask(rng: np.random.Generator, n: int, frac: float) -> np.ndarray:
+    """Return a boolean array of length n with exactly round(frac * n) True positions.
+
+    Using an exact count (rather than per-row Bernoulli draws) keeps the observed
+    missing fraction tight so the round-trip assertion can use a small tolerance.
+    """
+    mask = np.zeros(n, dtype=bool)
+    k = int(round(frac * n))
+    if k > 0:
+        idx = rng.choice(n, size=k, replace=False)
+        mask[idx] = True
+    return mask
+
+
+def _random_word(rng: np.random.Generator, min_len: int = 3, max_len: int = 10) -> str:
+    length = int(rng.integers(min_len, max_len + 1))
+    letters = np.array(list(string.ascii_lowercase))
+    return "".join(rng.choice(letters, size=length))
+
+
+def _random_sentence(rng: np.random.Generator, min_words: int = 8, max_words: int = 20) -> str:
+    n_words = int(rng.integers(min_words, max_words + 1))
+    return " ".join(_random_word(rng) for _ in range(n_words))
+
+
+def build_dataframe(rng: np.random.Generator) -> pd.DataFrame:
+    n = N_ROWS
+
+    ids = np.arange(1, n + 1, dtype=np.int64)
+
+    int_vals = rng.integers(0, 1000, size=n).astype(np.float64)
+
+    bigint_vals = rng.integers(10_000_000_000, 20_000_000_000, size=n).astype(np.float64)
+    bigint_vals[_missing_mask(rng, n, NULL_FRACTION)] = np.nan
+
+    float_vals = rng.normal(loc=100.0, scale=15.0, size=n)
+    float_vals[_missing_mask(rng, n, NULL_FRACTION)] = np.nan
+
+    bool_vals = rng.integers(0, 2, size=n).astype(np.float64)
+    bool_vals[_missing_mask(rng, n, NULL_FRACTION)] = np.nan
+
+    str_vals = [_random_word(rng, 3, 8) for _ in range(n)]
+
+    long_str_vals: list[str] = []
+    long_mask = _missing_mask(rng, n, NULL_FRACTION)
+    for i in range(n):
+        long_str_vals.append("" if long_mask[i] else _random_sentence(rng))
+
+    base_date = dt.date(2020, 1, 1)
+    date_vals = [base_date + dt.timedelta(days=int(rng.integers(0, 2000))) for _ in range(n)]
+
+    dt_vals_mask = _missing_mask(rng, n, NULL_FRACTION)
+    dt_vals: list = []
+    base_dt = dt.datetime(2020, 1, 1, 0, 0, 0)
+    for i in range(n):
+        if dt_vals_mask[i]:
+            dt_vals.append(pd.NaT)
+        else:
+            offset_seconds = int(rng.integers(0, 2000 * 24 * 3600))
+            dt_vals.append(base_dt + dt.timedelta(seconds=offset_seconds))
+    dt_series = pd.to_datetime(dt_vals)
+
+    time_mask = _missing_mask(rng, n, NULL_FRACTION)
+    time_vals: list = []
+    for i in range(n):
+        if time_mask[i]:
+            time_vals.append(None)
+        else:
+            seconds_into_day = int(rng.integers(0, 24 * 3600))
+            h, rem = divmod(seconds_into_day, 3600)
+            m, s = divmod(rem, 60)
+            time_vals.append(dt.time(h, m, s))
+
+    numasstr_mask = _missing_mask(rng, n, NULL_FRACTION)
+    numasstr_vals: list[str] = []
+    for i in range(n):
+        if numasstr_mask[i]:
+            numasstr_vals.append("")
+        elif rng.random() < 0.5:
+            numasstr_vals.append(str(int(rng.integers(-500, 500))))
+        else:
+            numasstr_vals.append(f"{rng.normal(0, 50):.2f}")
+
+    dateasstr_mask = _missing_mask(rng, n, NULL_FRACTION)
+    dateasstr_vals: list[str] = []
+    for i in range(n):
+        if dateasstr_mask[i]:
+            dateasstr_vals.append("")
+        else:
+            d = base_date + dt.timedelta(days=int(rng.integers(0, 2000)))
+            dateasstr_vals.append(d.isoformat())
+
+    mixed_mask = _missing_mask(rng, n, NULL_FRACTION)
+    mixed_vals: list[str] = []
+    choices = ["number", "date", "text", "text"]
+    for i in range(n):
+        if mixed_mask[i]:
+            mixed_vals.append("")
+            continue
+        kind = choices[int(rng.integers(0, len(choices)))]
+        if kind == "number":
+            mixed_vals.append(str(int(rng.integers(0, 1000))))
+        elif kind == "date":
+            d = base_date + dt.timedelta(days=int(rng.integers(0, 2000)))
+            mixed_vals.append(d.isoformat())
+        else:
+            mixed_vals.append(_random_word(rng, 4, 12))
+
+    const_vals = ["CONSTANT"] * n
+
+    allnull_vals = np.full(n, np.nan, dtype=np.float64)
+    allnullc_vals = [""] * n
+
+    df = pd.DataFrame(
+        {
+            "ID": ids,
+            "INTCOL": int_vals,
+            "BIGINT": bigint_vals,
+            "FLOATCOL": float_vals,
+            "BOOLCOL": bool_vals,
+            "STRCOL": str_vals,
+            "LONGSTR": long_str_vals,
+            "DATECOL": date_vals,
+            "DTCOL": dt_series,
+            "TIMECOL": time_vals,
+            "NUMASSTR": numasstr_vals,
+            "DATEASTR": dateasstr_vals,
+            "MIXED": mixed_vals,
+            "CONST": const_vals,
+            "ALLNULL": allnull_vals,
+            "ALLNULLC": allnullc_vals,
+        }
+    )
+    return df
+
+
+COLUMN_LABELS: dict[str, str] = {
+    "ID": "Row identifier",
+    "INTCOL": "Integer positive control",
+    "BIGINT": "Big integer beyond int32 range",
+    "FLOATCOL": "Floating point with decimals",
+    "BOOLCOL": "Nullable boolean 0/1/NaN",
+    "STRCOL": "Short string positive control",
+    "LONGSTR": "Longer free-text string",
+    "DATECOL": "Date positive control",
+    "DTCOL": "Datetime with missing values",
+    "TIMECOL": "Time of day with missing values",
+    "NUMASSTR": "Numeric-looking strings in a char column",
+    "DATEASTR": "Date-looking strings in a char column",
+    "MIXED": "Heterogeneous strings: fallback to text",
+    "CONST": "Constant repeated value",
+    "ALLNULL": "Entirely missing numeric column",
+    "ALLNULLC": "Entirely missing character column",
+}
+
+
+VARIABLE_FORMATS: dict[str, str] = {
+    "DATECOL": "DATE9.",
+    "DTCOL": "DATETIME20.",
+    "TIMECOL": "TIME8.",
+}
+
+
+EXPECTED_MANIFEST: dict[str, dict] = {
+    "ID": {"postgres_type": "INTEGER", "nullable": False},
+    "INTCOL": {"postgres_type": "INTEGER", "nullable": False, "note": "positive control"},
+    "BIGINT": {"postgres_type": "BIGINT", "nullable": True, "note": "values beyond int32 range"},
+    "FLOATCOL": {"acceptable_types": ["DOUBLE PRECISION", "NUMERIC"], "nullable": True},
+    "BOOLCOL": {
+        "acceptable_types": ["BOOLEAN", "SMALLINT", "INTEGER"],
+        "nullable": True,
+        "note": "{0,1,NaN} is genuinely ambiguous; loader's choice is a design decision",
+    },
+    "STRCOL": {"acceptable_types": ["TEXT", "VARCHAR"], "nullable": False, "note": "positive control"},
+    "LONGSTR": {"acceptable_types": ["TEXT", "VARCHAR"], "nullable": True},
+    "DATECOL": {"postgres_type": "DATE", "nullable": False, "note": "positive control"},
+    "DTCOL": {"acceptable_types": ["TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE"], "nullable": True},
+    "TIMECOL": {"postgres_type": "TIME", "nullable": True},
+    "NUMASSTR": {
+        "acceptable_types": ["NUMERIC", "DOUBLE PRECISION"],
+        "nullable": True,
+        "note": "stored as char in SAS; loader should coerce numeric-looking strings",
+    },
+    "DATEASTR": {
+        "postgres_type": "DATE",
+        "nullable": True,
+        "note": "stored as char in SAS; loader should coerce ISO-date strings",
+    },
+    "MIXED": {
+        "acceptable_types": ["TEXT", "VARCHAR"],
+        "nullable": True,
+        "note": "heterogeneous content; loader should fall back to text",
+    },
+    "CONST": {"acceptable_types": ["TEXT", "VARCHAR"], "nullable": False},
+    "ALLNULL": {
+        "acceptable_types": ["TEXT", "VARCHAR"],
+        "nullable": True,
+        "note": "entirely null numeric; loader must pick a default type, typically TEXT",
+    },
+    "ALLNULLC": {
+        "acceptable_types": ["TEXT", "VARCHAR"],
+        "nullable": True,
+        "note": "entirely null character",
+    },
+}
+
+
+def write_manifest(df: pd.DataFrame) -> None:
+    manifest_cols = set(EXPECTED_MANIFEST.keys())
+    df_cols = set(df.columns)
+    missing = df_cols - manifest_cols
+    extra = manifest_cols - df_cols
+    if missing or extra:
+        raise AssertionError(
+            f"Manifest/DataFrame column mismatch. Missing from manifest: {missing}. "
+            f"Extra in manifest: {extra}."
+        )
+    with MANIFEST_PATH.open("w", encoding="utf-8") as f:
+        json.dump(EXPECTED_MANIFEST, f, indent=2, sort_keys=True)
+        f.write("\n")
+
+
+def _char_missing_fraction(series: pd.Series) -> float:
+    return float((series.fillna("").astype(str) == "").mean())
+
+
+def _numeric_missing_fraction(series: pd.Series) -> float:
+    return float(series.isna().mean())
+
+
+def verify_roundtrip(source_df: pd.DataFrame) -> pd.DataFrame:
+    # Use pyreadstat (the writer) to verify the writer's output. pyreadstat preserves
+    # SAS format metadata on readback, so we can confirm the date/datetime/time
+    # variable_format mappings actually took effect.
+    readback, _meta = pyreadstat.read_xport(str(OUT_PATH))
+
+    assert len(readback.columns) == len(source_df.columns), (
+        f"Column count mismatch: wrote {len(source_df.columns)}, read back {len(readback.columns)}"
+    )
+    assert set(readback.columns) == set(source_df.columns), (
+        f"Column name mismatch. Only in source: {set(source_df.columns) - set(readback.columns)}. "
+        f"Only in readback: {set(readback.columns) - set(source_df.columns)}."
+    )
+    assert len(readback) == len(source_df), (
+        f"Row count mismatch: wrote {len(source_df)}, read back {len(readback)}"
+    )
+
+    for col in ("DATECOL", "DTCOL"):
+        dtype = readback[col].dtype
+        is_datetime = pd.api.types.is_datetime64_any_dtype(dtype)
+        is_object_of_dates = pd.api.types.is_object_dtype(dtype) and readback[col].dropna().map(
+            lambda v: isinstance(v, (dt.date, dt.datetime, pd.Timestamp))
+        ).all()
+        assert is_datetime or is_object_of_dates, (
+            f"{col} came back as {dtype}; expected datetime-like. "
+            f"variable_format mapping may not have taken effect."
+        )
+
+    time_dtype = readback["TIMECOL"].dtype
+    time_ok = (
+        pd.api.types.is_datetime64_any_dtype(time_dtype)
+        or pd.api.types.is_numeric_dtype(time_dtype)
+        or (
+            pd.api.types.is_object_dtype(time_dtype)
+            and readback["TIMECOL"].dropna().map(
+                lambda v: isinstance(v, (dt.time, dt.datetime, pd.Timestamp, int, float))
+            ).all()
+        )
+    )
+    assert time_ok, f"TIMECOL came back as {time_dtype}; expected datetime/numeric/time-object"
+
+    tol = 0.10
+    for col in source_df.columns:
+        if col in POSITIVE_CONTROLS:
+            series = readback[col]
+            if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_datetime64_any_dtype(series):
+                observed = _numeric_missing_fraction(series)
+            else:
+                observed = _char_missing_fraction(series)
+            assert observed == 0.0, (
+                f"Positive control {col!r} has {observed:.2%} missing; expected 0%."
+            )
+            continue
+
+        if col in ALL_NULL_COLS:
+            series = readback[col]
+            if pd.api.types.is_numeric_dtype(series):
+                observed = _numeric_missing_fraction(series)
+            else:
+                observed = _char_missing_fraction(series)
+            assert observed == 1.0, (
+                f"All-null column {col!r} has {observed:.2%} missing; expected 100%."
+            )
+            continue
+
+        series = readback[col]
+        if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_datetime64_any_dtype(series):
+            observed = _numeric_missing_fraction(series)
+        else:
+            observed = _char_missing_fraction(series)
+        assert abs(observed - NULL_FRACTION) <= tol, (
+            f"Column {col!r}: observed missing fraction {observed:.2%} not within "
+            f"±{tol:.0%} of NULL_FRACTION={NULL_FRACTION:.2%}."
+        )
+
+    assert MANIFEST_PATH.exists(), f"Manifest file {MANIFEST_PATH} missing."
+    with MANIFEST_PATH.open("r", encoding="utf-8") as f:
+        manifest = json.load(f)
+    assert set(manifest.keys()) == set(readback.columns), (
+        f"Manifest/readback column set mismatch. "
+        f"Only in manifest: {set(manifest.keys()) - set(readback.columns)}. "
+        f"Only in readback: {set(readback.columns) - set(manifest.keys())}."
+    )
+
+    return readback
+
+
+def main() -> None:
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    rng = np.random.default_rng(SEED)
+    df = build_dataframe(rng)
+
+    pyreadstat.write_xport(
+        df,
+        str(OUT_PATH),
+        file_format_version=5,
+        table_name="SAMPLE",
+        file_label="Kitchen sink sample for loader testing",
+        column_labels=COLUMN_LABELS,
+        variable_format=VARIABLE_FORMATS,
+    )
+
+    write_manifest(df)
+
+    readback = verify_roundtrip(df)
+
+    print(f"Wrote {OUT_PATH} ({N_ROWS} rows x {len(df.columns)} cols)")
+    print(f"Wrote {MANIFEST_PATH}")
+    print()
+    print("Readback via pyreadstat.read_xport (same reader the loader will use):")
+    print(readback.dtypes.to_string())
+    print()
+    print("Readback head:")
+    print(readback.head().to_string())
+
+
+if __name__ == "__main__":
+    main()
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@ -0,0 +1,904 @@
+"""Per-file SAS-to-Postgres loader.
+
+Library-style functions plus a thin CLI wrapper. Designed so an orchestrator
+can wrap the library for directory/batch mode; orchestration is out of scope
+here.
+
+Python 3.9 compatible (target is an air-gapped host that currently only has
+3.9). ``from __future__ import annotations`` lets us use PEP 585 generics
+as annotations; runtime-resolved type uses (dataclass defaults, etc.) stick
+to ``typing``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import io
+import json
+import os
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import pandas as pd
+import psycopg2
+import psycopg2.extensions
+import pyreadstat
+import yaml
+from dotenv import load_dotenv
+
+
+# ---------------------------------------------------------------------------
+# Top-level tunables
+# ---------------------------------------------------------------------------
+
+COERCE_CHAR_COLUMNS = True
+"""If True, try to promote object (string) columns to numeric/date/timestamp
+when every non-empty value parses cleanly."""
+
+CHAR_INFERENCE_MIN_VALUES = 3
+"""Don't attempt character-column coercion with fewer than this many non-empty
+values; too small a sample is easy to mis-infer."""
+
+NUMERIC_INT_RANGE = (-2_147_483_648, 2_147_483_647)
+"""INTEGER bounds; anything outside becomes BIGINT."""
+
+
+VALID_IF_EXISTS = ("fail", "replace", "append")
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LoaderConfig:
+    filename: Path
+    schemaname: str
+    tablename: str
+    if_exists: str = "fail"
+    include: Optional[List[str]] = None
+    exclude: Optional[List[str]] = None
+
+
+@dataclass
+class ColumnSpec:
+    name: str
+    postgres_type: str
+    nullable: bool
+    sas_format: Optional[str] = None
+    source_dtype: Optional[str] = None
+    notes: List[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Custom exceptions
+# ---------------------------------------------------------------------------
+
+
+class TableExistsError(RuntimeError):
+    """Raised when if_exists=fail and the target table already exists."""
+
+
+class SchemaCompatibilityError(RuntimeError):
+    """Raised when if_exists=append and the incoming schema is not
+    compatible with the existing table."""
+
+
+class ValidationError(RuntimeError):
+    """Raised when --validate detects a mismatch against the manifest."""
+
+
+# ---------------------------------------------------------------------------
+# Connection
+# ---------------------------------------------------------------------------
+
+
+def connect() -> psycopg2.extensions.connection:
+    """Open a psycopg2 connection using standard libpq env vars.
+
+    Assumes `.env` has already been loaded (the CLI does this before calling).
+    Orchestrators that wrap this module should either call ``load_dotenv()``
+    themselves or ensure the env vars are set.
+    """
+    conn = psycopg2.connect(
+        host=os.environ.get("PGHOST"),
+        port=os.environ.get("PGPORT"),
+        user=os.environ.get("PGUSER"),
+        password=os.environ.get("PGPASSWORD"),
+        dbname=os.environ.get("PGDATABASE"),
+    )
+    return conn
+
+
+# ---------------------------------------------------------------------------
+# Config loading
+# ---------------------------------------------------------------------------
+
+
+def load_config(path: Path) -> LoaderConfig:
+    """Parse and validate the YAML config at ``path``."""
+    path = Path(path)
+    with path.open("r", encoding="utf-8") as f:
+        raw = yaml.safe_load(f)
+
+    if not isinstance(raw, dict):
+        raise ValueError(f"Config at {path} must be a YAML mapping at the top level.")
+
+    missing = [k for k in ("filename", "schemaname", "tablename") if k not in raw]
+    if missing:
+        raise ValueError(f"Config {path} missing required keys: {', '.join(missing)}")
+
+    filename = Path(raw["filename"])
+    if not filename.is_absolute():
+        filename = (path.parent / filename).resolve() if (path.parent / filename).exists() else Path(raw["filename"])
+
+    schemaname = str(raw["schemaname"])
+    tablename = str(raw["tablename"])
+
+    if_exists = str(raw.get("if_exists", "fail")).lower()
+    if if_exists not in VALID_IF_EXISTS:
+        raise ValueError(
+            f"Config {path}: if_exists={if_exists!r} is not one of {VALID_IF_EXISTS}"
+        )
+
+    include = raw.get("include")
+    exclude = raw.get("exclude")
+    if include is not None and exclude is not None:
+        raise ValueError(
+            f"Config {path}: 'include' and 'exclude' are mutually exclusive; set at most one."
+        )
+    if include is not None and not isinstance(include, list):
+        raise ValueError(f"Config {path}: 'include' must be a list of column names.")
+    if exclude is not None and not isinstance(exclude, list):
+        raise ValueError(f"Config {path}: 'exclude' must be a list of column names.")
+
+    return LoaderConfig(
+        filename=filename,
+        schemaname=schemaname,
+        tablename=tablename,
+        if_exists=if_exists,
+        include=[str(c) for c in include] if include is not None else None,
+        exclude=[str(c) for c in exclude] if exclude is not None else None,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Reader
+# ---------------------------------------------------------------------------
+
+
+def read_sas(path: Path) -> Tuple[pd.DataFrame, Any]:
+    """Dispatch to the right pyreadstat reader by extension.
+
+    Invariants (learned the hard way while building the sample generator):
+
+    * ``.xpt`` / ``.xport`` - no encoding arg; pyreadstat is flaky about
+      encoding on XPORT files it wrote itself.
+    * ``.sas7bdat`` - explicit ``encoding="latin-1"`` per colleague guidance.
+    """
+    path = Path(path)
+    suffix = path.suffix.lower()
+    if suffix in (".xpt", ".xport"):
+        return pyreadstat.read_xport(str(path))
+    if suffix == ".sas7bdat":
+        return pyreadstat.read_sas7bdat(str(path), encoding="latin-1")
+    raise ValueError(f"Unsupported SAS file extension: {suffix}")
+
+
+# ---------------------------------------------------------------------------
+# Column filtering
+# ---------------------------------------------------------------------------
+
+
+def apply_column_filter(
+    df: pd.DataFrame,
+    include: Optional[List[str]],
+    exclude: Optional[List[str]],
+) -> pd.DataFrame:
+    """Restrict ``df`` to the requested columns. Names missing from the frame
+    raise a clear error rather than silently dropping."""
+    if include is not None and exclude is not None:
+        raise ValueError("include and exclude are mutually exclusive.")
+
+    if include is not None:
+        missing = [c for c in include if c not in df.columns]
+        if missing:
+            raise ValueError(f"include references unknown columns: {missing}")
+        return df.loc[:, list(include)].copy()
+
+    if exclude is not None:
+        missing = [c for c in exclude if c not in df.columns]
+        if missing:
+            raise ValueError(f"exclude references unknown columns: {missing}")
+        return df.drop(columns=list(exclude)).copy()
+
+    return df.copy()
+
+
+# ---------------------------------------------------------------------------
+# Type inference
+# ---------------------------------------------------------------------------
+
+
+_DATE_FORMAT_PREFIXES = ("DATE", "YYMMDD", "MMDDYY", "DDMMYY", "JULIAN")
+
+
+def _format_driven_type(sas_format: Optional[str]) -> Optional[str]:
+    """Return a Postgres type inferred from the SAS format string, or None
+    if the format doesn't pin it down."""
+    if not sas_format:
+        return None
+    fmt = sas_format.upper().lstrip()
+    # DATETIME must be checked before DATE since "DATETIME20." starts with "DATE".
+    if fmt.startswith("DATETIME"):
+        return "TIMESTAMP"
+    if fmt.startswith("TIME"):
+        return "TIME"
+    for prefix in _DATE_FORMAT_PREFIXES:
+        if fmt.startswith(prefix):
+            return "DATE"
+    return None
+
+
+def _all_null(series: pd.Series) -> bool:
+    if pd.api.types.is_object_dtype(series):
+        return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
+    return bool(series.isna().all())
+
+
+def _char_missing_mask(series: pd.Series) -> pd.Series:
+    return series.map(lambda v: v is None or (isinstance(v, float) and pd.isna(v)) or (isinstance(v, str) and v == ""))
+
+
+def _is_nullable(series: pd.Series) -> bool:
+    """True if the column has at least one missing value."""
+    if pd.api.types.is_object_dtype(series):
+        return bool(_char_missing_mask(series).any())
+    return bool(series.isna().any())
+
+
+def _numeric_int_target(series: pd.Series) -> Optional[str]:
+    """Given a numeric (float64) series, if every non-null value is a whole
+    number, return INTEGER or BIGINT depending on range; else None."""
+    nonnull = series.dropna()
+    if nonnull.empty:
+        return None
+    # Whole-number test. Guard against inf.
+    try:
+        whole = ((nonnull % 1) == 0).all()
+    except TypeError:
+        return None
+    if not whole:
+        return None
+    lo, hi = NUMERIC_INT_RANGE
+    vmin = nonnull.min()
+    vmax = nonnull.max()
+    if lo <= vmin and vmax <= hi:
+        return "INTEGER"
+    return "BIGINT"
+
+
+def _object_is_dates(series: pd.Series) -> Tuple[bool, bool]:
+    """Return (all-date-like, any-datetime). If every non-null value is a
+    ``datetime.date`` / ``datetime.datetime`` / ``pd.Timestamp``, return True
+    plus whether at least one carries a time component."""
+    nonnull = series.dropna()
+    if nonnull.empty:
+        return False, False
+    any_datetime = False
+    for v in nonnull:
+        if isinstance(v, dt.datetime) or isinstance(v, pd.Timestamp):
+            any_datetime = True
+            continue
+        if isinstance(v, dt.date):
+            continue
+        return False, False
+    return True, any_datetime
+
+
+def _try_int_coerce(values: List[str]) -> Optional[str]:
+    """If every value parses as an int, return INTEGER/BIGINT, else None."""
+    ints: List[int] = []
+    for v in values:
+        s = v.strip()
+        try:
+            ints.append(int(s))
+        except ValueError:
+            return None
+    if not ints:
+        return None
+    lo, hi = NUMERIC_INT_RANGE
+    if all(lo <= i <= hi for i in ints):
+        return "INTEGER"
+    return "BIGINT"
+
+
+def _try_float_coerce(values: List[str]) -> bool:
+    for v in values:
+        try:
+            float(v)
+        except ValueError:
+            return False
+    return True
+
+
+def _try_date_coerce(values: List[str]) -> bool:
+    for v in values:
+        try:
+            dt.date.fromisoformat(v)
+        except (ValueError, TypeError):
+            return False
+    return True
+
+
+def _try_datetime_coerce(values: List[str]) -> bool:
+    for v in values:
+        try:
+            dt.datetime.fromisoformat(v)
+        except (ValueError, TypeError):
+            return False
+    return True
+
+
+def _infer_char_type(series: pd.Series) -> str:
+    """Object/string column inference. Returns a Postgres type string."""
+    mask = _char_missing_mask(series)
+    nonempty = [str(v) for v in series[~mask].tolist()]
+    if not COERCE_CHAR_COLUMNS or len(nonempty) < CHAR_INFERENCE_MIN_VALUES:
+        return "TEXT"
+
+    int_guess = _try_int_coerce(nonempty)
+    if int_guess is not None:
+        return int_guess
+    if _try_float_coerce(nonempty):
+        return "DOUBLE PRECISION"
+    if _try_date_coerce(nonempty):
+        return "DATE"
+    if _try_datetime_coerce(nonempty):
+        return "TIMESTAMP"
+    return "TEXT"
+
+
+def infer_schema(
+    df: pd.DataFrame,
+    meta: Any,
+    *,
+    coerce_chars: bool = COERCE_CHAR_COLUMNS,
+) -> Dict[str, ColumnSpec]:
+    """Infer a Postgres column spec for each column in ``df``.
+
+    ``meta`` is the pyreadstat metadata object; we read
+    ``meta.original_variable_types`` (a dict keyed by column name) for
+    format-driven date/time/timestamp inference.
+
+    The ``coerce_chars`` kwarg lets callers override the module-level
+    ``COERCE_CHAR_COLUMNS`` without mutating global state. Internally the
+    char-inference helpers still read the constant - a full override would
+    thread the flag through, but the one-knob story here is intentional.
+    """
+    original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
+
+    # Temporarily flip the module-level flag if the caller asked us to.
+    global COERCE_CHAR_COLUMNS
+    saved = COERCE_CHAR_COLUMNS
+    COERCE_CHAR_COLUMNS = coerce_chars
+    try:
+        out: Dict[str, ColumnSpec] = {}
+        for col in df.columns:
+            series = df[col]
+            sas_format = original_formats.get(col)
+            notes: List[str] = []
+
+            pg_type = _format_driven_type(sas_format)
+
+            if pg_type is None:
+                if _all_null(series):
+                    pg_type = "TEXT"
+                    notes.append("all-null column; defaulting to TEXT")
+                elif pd.api.types.is_datetime64_any_dtype(series):
+                    pg_type = "TIMESTAMP"
+                elif pd.api.types.is_object_dtype(series):
+                    is_dates, any_dt = _object_is_dates(series)
+                    if is_dates:
+                        pg_type = "TIMESTAMP" if any_dt else "DATE"
+                    else:
+                        pg_type = _infer_char_type(series)
+                elif pd.api.types.is_numeric_dtype(series):
+                    int_target = _numeric_int_target(series)
+                    if int_target is not None:
+                        pg_type = int_target
+                    else:
+                        pg_type = "DOUBLE PRECISION"
+                else:
+                    pg_type = "TEXT"
+                    notes.append(f"unhandled dtype {series.dtype}; defaulting to TEXT")
+
+            nullable = _is_nullable(series)
+
+            out[col] = ColumnSpec(
+                name=col,
+                postgres_type=pg_type,
+                nullable=nullable,
+                sas_format=sas_format,
+                source_dtype=str(series.dtype),
+                notes=notes,
+            )
+        return out
+    finally:
+        COERCE_CHAR_COLUMNS = saved
+
+
+# ---------------------------------------------------------------------------
+# Table management
+# ---------------------------------------------------------------------------
+
+
+def _quote_ident(ident: str) -> str:
+    """Quote a Postgres identifier. psycopg2 doesn't expose this directly
+    until 2.8+ with sql.Identifier; we do it by hand to stay driver-simple."""
+    return '"' + ident.replace('"', '""') + '"'
+
+
+def _qualified(schema: str, table: str) -> str:
+    return f"{_quote_ident(schema)}.{_quote_ident(table)}"
+
+
+def _table_exists(conn, schema: str, table: str) -> bool:
+    with conn.cursor() as cur:
+        cur.execute(
+            "SELECT 1 FROM information_schema.tables "
+            "WHERE table_schema = %s AND table_name = %s",
+            (schema, table),
+        )
+        return cur.fetchone() is not None
+
+
+def render_create_table(schema: str, table: str, columns: Dict[str, ColumnSpec]) -> str:
+    lines = []
+    for spec in columns.values():
+        null_clause = "" if spec.nullable else " NOT NULL"
+        lines.append(f"    {_quote_ident(spec.name)} {spec.postgres_type}{null_clause}")
+    body = ",\n".join(lines)
+    return f"CREATE TABLE {_qualified(schema, table)} (\n{body}\n);"
+
+
+def _create_table_sql(conn, schema: str, table: str, columns: Dict[str, ColumnSpec]) -> None:
+    sql = render_create_table(schema, table, columns)
+    with conn.cursor() as cur:
+        cur.execute(sql)
+
+
+def _drop_table(conn, schema: str, table: str) -> None:
+    with conn.cursor() as cur:
+        cur.execute(f"DROP TABLE {_qualified(schema, table)}")
+
+
+# Normalization table: map both loader-emitted and Postgres-reported type
+# strings to a single canonical family name. Ignore length/precision
+# modifiers like VARCHAR(n) and NUMERIC(p,s).
+_TYPE_NORMALIZATION: Dict[str, str] = {
+    "INTEGER": "integer",
+    "INT": "integer",
+    "INT4": "integer",
+    "BIGINT": "bigint",
+    "INT8": "bigint",
+    "SMALLINT": "smallint",
+    "INT2": "smallint",
+    "DOUBLE PRECISION": "double precision",
+    "FLOAT8": "double precision",
+    "REAL": "real",
+    "FLOAT4": "real",
+    "NUMERIC": "numeric",
+    "DECIMAL": "numeric",
+    "TEXT": "text",
+    "VARCHAR": "character varying",
+    "CHARACTER VARYING": "character varying",
+    "CHAR": "character",
+    "CHARACTER": "character",
+    "BPCHAR": "character",
+    "BOOLEAN": "boolean",
+    "BOOL": "boolean",
+    "DATE": "date",
+    "TIMESTAMP": "timestamp without time zone",
+    "TIMESTAMP WITHOUT TIME ZONE": "timestamp without time zone",
+    "TIMESTAMPTZ": "timestamp with time zone",
+    "TIMESTAMP WITH TIME ZONE": "timestamp with time zone",
+    "TIME": "time without time zone",
+    "TIME WITHOUT TIME ZONE": "time without time zone",
+    "TIMETZ": "time with time zone",
+    "TIME WITH TIME ZONE": "time with time zone",
+}
+
+
+def _normalize_type(pg_type: str) -> str:
+    """Strip length/precision modifiers and map to canonical family."""
+    stripped = pg_type.strip().upper()
+    # Remove trailing (n) / (p,s) before the space-separated tail.
+    # Examples: "VARCHAR(10)" -> "VARCHAR"; "TIMESTAMP(6) WITHOUT TIME ZONE" -> "TIMESTAMP WITHOUT TIME ZONE"
+    import re
+
+    stripped = re.sub(r"\(\s*\d+\s*(?:,\s*\d+\s*)?\)", "", stripped).strip()
+    # Collapse doubled whitespace after paren removal.
+    stripped = re.sub(r"\s+", " ", stripped)
+    return _TYPE_NORMALIZATION.get(stripped, stripped.lower())
+
+
+def _assert_schema_compatible(
+    conn, schema: str, table: str, columns: Dict[str, ColumnSpec]
+) -> None:
+    """Pre-flight check for if_exists=append. See plan section on option B."""
+    with conn.cursor() as cur:
+        cur.execute(
+            "SELECT column_name, data_type, is_nullable "
+            "FROM information_schema.columns "
+            "WHERE table_schema = %s AND table_name = %s",
+            (schema, table),
+        )
+        existing = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
+
+    mismatches: List[str] = []
+    warnings: List[str] = []
+
+    for name, spec in columns.items():
+        if name not in existing:
+            mismatches.append(
+                f"column {name!r} not present in target {schema}.{table}"
+            )
+            continue
+        target_type, target_nullable = existing[name]
+        inferred_norm = _normalize_type(spec.postgres_type)
+        target_norm = _normalize_type(target_type)
+        if inferred_norm != target_norm:
+            mismatches.append(
+                f"column {name!r}: inferred {spec.postgres_type} "
+                f"(normalized {inferred_norm!r}) but target is {target_type} "
+                f"(normalized {target_norm!r})"
+            )
+        target_is_notnull = (target_nullable == "NO")
+        if spec.nullable and target_is_notnull:
+            warnings.append(
+                f"column {name!r}: incoming allows NULLs but target is NOT NULL; "
+                "COPY will fail if any NULLs appear"
+            )
+
+    for w in warnings:
+        print(f"[warn] {w}", file=sys.stderr)
+
+    if mismatches:
+        raise SchemaCompatibilityError(
+            "append-mode schema compatibility check failed:\n  - "
+            + "\n  - ".join(mismatches)
+        )
+
+
+def create_table(
+    conn,
+    schema_name: str,
+    table_name: str,
+    columns: Dict[str, ColumnSpec],
+    if_exists: str,
+) -> None:
+    """Create (or verify) the target table according to ``if_exists``."""
+    if if_exists not in VALID_IF_EXISTS:
+        raise ValueError(f"if_exists must be one of {VALID_IF_EXISTS}, got {if_exists!r}")
+
+    exists = _table_exists(conn, schema_name, table_name)
+    if exists:
+        if if_exists == "fail":
+            raise TableExistsError(
+                f"Table {schema_name}.{table_name} already exists and if_exists=fail"
+            )
+        if if_exists == "replace":
+            _drop_table(conn, schema_name, table_name)
+            _create_table_sql(conn, schema_name, table_name, columns)
+            return
+        if if_exists == "append":
+            _assert_schema_compatible(conn, schema_name, table_name, columns)
+            return
+    else:
+        _create_table_sql(conn, schema_name, table_name, columns)
+
+
+# ---------------------------------------------------------------------------
+# COPY loading
+# ---------------------------------------------------------------------------
+
+
+def _seconds_to_time(v: Any) -> Optional[dt.time]:
+    if v is None:
+        return None
+    if isinstance(v, float) and pd.isna(v):
+        return None
+    if isinstance(v, dt.time):
+        return v
+    if isinstance(v, (dt.datetime, pd.Timestamp)):
+        return v.time() if not pd.isna(v) else None
+    try:
+        total = int(round(float(v)))
+    except (TypeError, ValueError):
+        return None
+    h, rem = divmod(total, 3600)
+    m, s = divmod(rem, 60)
+    # Clamp; TIME8. is always within a day.
+    h = max(0, min(h, 23))
+    return dt.time(h, m, s)
+
+
+def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.DataFrame:
+    """Materialize a copy of ``df`` with each column in the right shape for
+    ``to_csv`` so the CSV lands as valid input for the target Postgres type.
+    """
+    out = pd.DataFrame(index=df.index)
+    for name, spec in columns.items():
+        series = df[name]
+        pg = spec.postgres_type.upper()
+
+        if pg in ("INTEGER", "BIGINT", "SMALLINT"):
+            if pd.api.types.is_object_dtype(series):
+                series = pd.to_numeric(
+                    series.replace({"": None}), errors="coerce"
+                )
+            out[name] = series.astype("Int64")
+        elif pg in ("DOUBLE PRECISION", "REAL", "NUMERIC"):
+            if pd.api.types.is_object_dtype(series):
+                series = pd.to_numeric(
+                    series.replace({"": None}), errors="coerce"
+                )
+            out[name] = series.astype("float64")
+        elif pg == "DATE":
+            if pd.api.types.is_datetime64_any_dtype(series):
+                out[name] = series.dt.date
+            elif pd.api.types.is_object_dtype(series):
+                def _to_date(v: Any) -> Optional[dt.date]:
+                    if v is None or (isinstance(v, float) and pd.isna(v)):
+                        return None
+                    if isinstance(v, dt.datetime):
+                        return v.date()
+                    if isinstance(v, dt.date):
+                        return v
+                    if isinstance(v, str):
+                        if v == "":
+                            return None
+                        try:
+                            return dt.date.fromisoformat(v)
+                        except ValueError:
+                            return None
+                    return None
+                out[name] = series.map(_to_date)
+            else:
+                out[name] = series
+        elif pg in ("TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE", "TIMESTAMP WITH TIME ZONE"):
+            if pd.api.types.is_datetime64_any_dtype(series):
+                out[name] = series
+            elif pd.api.types.is_object_dtype(series):
+                def _to_dt(v: Any) -> Optional[dt.datetime]:
+                    if v is None or (isinstance(v, float) and pd.isna(v)):
+                        return None
+                    if isinstance(v, dt.datetime):
+                        return v
+                    if isinstance(v, dt.date):
+                        return dt.datetime(v.year, v.month, v.day)
+                    if isinstance(v, pd.Timestamp):
+                        return v.to_pydatetime() if not pd.isna(v) else None
+                    if isinstance(v, str):
+                        if v == "":
+                            return None
+                        try:
+                            return dt.datetime.fromisoformat(v)
+                        except ValueError:
+                            return None
+                    return None
+                out[name] = series.map(_to_dt)
+            else:
+                out[name] = series
+        elif pg in ("TIME", "TIME WITHOUT TIME ZONE", "TIME WITH TIME ZONE"):
+            out[name] = series.map(_seconds_to_time)
+        elif pg in ("TEXT", "VARCHAR", "CHARACTER VARYING", "CHAR", "CHARACTER"):
+            # Leave empty strings as "" so `NULL ''` in COPY turns them into NULL.
+            def _to_str(v: Any) -> Any:
+                if v is None:
+                    return ""
+                if isinstance(v, float) and pd.isna(v):
+                    return ""
+                return str(v)
+            out[name] = series.map(_to_str)
+        elif pg == "BOOLEAN":
+            out[name] = series.astype("boolean") if series.dtype != object else series
+        else:
+            out[name] = series
+    return out
+
+
+def copy_dataframe(
+    conn,
+    schema_name: str,
+    table_name: str,
+    df: pd.DataFrame,
+    columns: Dict[str, ColumnSpec],
+) -> int:
+    """Stream ``df`` into Postgres via ``COPY ... FROM STDIN``.
+
+    Returns the number of rows inserted.
+    """
+    prepared = _prepare_for_copy(df, columns)
+
+    buf = io.StringIO()
+    prepared.to_csv(
+        buf,
+        index=False,
+        header=False,
+        na_rep="",
+        date_format="%Y-%m-%d %H:%M:%S",
+    )
+    buf.seek(0)
+
+    col_list = ", ".join(_quote_ident(name) for name in columns.keys())
+    sql = (
+        f"COPY {_qualified(schema_name, table_name)} ({col_list}) "
+        f"FROM STDIN WITH (FORMAT csv, NULL '')"
+    )
+
+    with conn.cursor() as cur:
+        cur.copy_expert(sql, buf)
+        rowcount = cur.rowcount
+
+    return int(rowcount) if rowcount is not None else len(prepared)
+
+
+# ---------------------------------------------------------------------------
+# Manifest validation
+# ---------------------------------------------------------------------------
+
+
+def _match_manifest_type(inferred: str, manifest_entry: Dict[str, Any]) -> bool:
+    inferred_norm = _normalize_type(inferred)
+    if "postgres_type" in manifest_entry:
+        return inferred_norm == _normalize_type(manifest_entry["postgres_type"])
+    if "acceptable_types" in manifest_entry:
+        return any(
+            inferred_norm == _normalize_type(t)
+            for t in manifest_entry["acceptable_types"]
+        )
+    return False
+
+
+def validate_against_manifest(
+    inferred: Dict[str, ColumnSpec],
+    manifest_path: Path,
+) -> List[str]:
+    """Compare the inferred schema against the expected-types manifest.
+
+    Returns a list of human-readable problem strings; empty list means OK.
+    """
+    manifest_path = Path(manifest_path)
+    if not manifest_path.exists():
+        return [f"manifest not found: {manifest_path}"]
+
+    with manifest_path.open("r", encoding="utf-8") as f:
+        manifest = json.load(f)
+
+    problems: List[str] = []
+
+    only_in_inferred = set(inferred) - set(manifest)
+    only_in_manifest = set(manifest) - set(inferred)
+    if only_in_inferred:
+        problems.append(
+            f"columns in inferred but not manifest: {sorted(only_in_inferred)}"
+        )
+    if only_in_manifest:
+        problems.append(
+            f"columns in manifest but not inferred: {sorted(only_in_manifest)}"
+        )
+
+    for name, spec in inferred.items():
+        entry = manifest.get(name)
+        if entry is None:
+            continue
+        if not _match_manifest_type(spec.postgres_type, entry):
+            expected = entry.get("postgres_type") or entry.get("acceptable_types")
+            problems.append(
+                f"column {name!r}: inferred {spec.postgres_type!r}, "
+                f"manifest expected {expected!r}"
+            )
+        manifest_nullable = bool(entry.get("nullable", True))
+        if spec.nullable and not manifest_nullable:
+            problems.append(
+                f"column {name!r}: inferred nullable, manifest expects NOT NULL "
+                f"(loosening nullability is never allowed)"
+            )
+
+    return problems
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _build_argparser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        description="Load a single SAS file (XPT or sas7bdat) into Postgres.",
+    )
+    p.add_argument("--config", required=True, type=Path, help="Path to YAML config")
+    p.add_argument(
+        "--validate",
+        action="store_true",
+        help=(
+            "Compare inferred schema against <filename-stem>.expected.json "
+            "next to the SAS file; exits nonzero on mismatch."
+        ),
+    )
+    p.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print inferred CREATE TABLE and stop; don't touch Postgres.",
+    )
+    return p
+
+
+def _format_columns_summary(columns: Dict[str, ColumnSpec]) -> str:
+    lines = []
+    for spec in columns.values():
+        null = "" if spec.nullable else " NOT NULL"
+        lines.append(f"  {spec.name}: {spec.postgres_type}{null}")
+    return "\n".join(lines)
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    args = _build_argparser().parse_args(argv)
+
+    load_dotenv()
+
+    cfg = load_config(args.config)
+
+    if not cfg.filename.exists():
+        print(f"error: SAS file not found: {cfg.filename}", file=sys.stderr)
+        return 2
+
+    df, meta = read_sas(cfg.filename)
+    df = apply_column_filter(df, cfg.include, cfg.exclude)
+    columns = infer_schema(df, meta)
+
+    if args.validate:
+        manifest_path = cfg.filename.with_suffix("").with_suffix(".expected.json")
+        # The above strips .xpt then appends .expected.json, e.g.
+        # "sample_kitchensink.xpt" -> "sample_kitchensink.expected.json".
+        problems = validate_against_manifest(columns, manifest_path)
+        if problems:
+            print("validation failed:", file=sys.stderr)
+            for p in problems:
+                print(f"  - {p}", file=sys.stderr)
+            return 1
+        print(f"validation OK ({len(columns)} columns match {manifest_path.name})")
+
+    if args.dry_run:
+        print(render_create_table(cfg.schemaname, cfg.tablename, columns))
+        return 0
+
+    conn = connect()
+    conn.autocommit = False
+    try:
+        create_table(conn, cfg.schemaname, cfg.tablename, columns, cfg.if_exists)
+        inserted = copy_dataframe(conn, cfg.schemaname, cfg.tablename, df, columns)
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.close()
+
+    print(
+        f"loaded {inserted} rows into {cfg.schemaname}.{cfg.tablename} "
+        f"({len(columns)} columns)"
+    )
+    print("final schema:")
+    print(_format_columns_summary(columns))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/generic_loader/requirements.txt
+++ b/generic_loader/requirements.txt
@ -0,0 +1,6 @@
+pandas>=2.0,<2.3
+pyreadstat>=1.2,<1.3
+numpy>=1.24,<2.1
+pyyaml>=6.0,<7.0
+psycopg2-binary>=2.9,<3.0
+python-dotenv>=1.0,<2.0
--- a/generic_loader/sample_config.yaml
+++ b/generic_loader/sample_config.yaml
@ -0,0 +1,17 @@
+filename: samples/sample_kitchensink.xpt
+schemaname: public
+tablename: kitchensink
+
+# Optional. If set, only these columns are loaded. Mutually exclusive with exclude.
+# include:
+#   - ID
+#   - INTCOL
+#   - DATECOL
+
+# Optional. Columns to drop.
+# exclude:
+#   - ALLNULL
+
+# What to do if the target table already exists: fail | replace | append
+# Defaults to fail.
+if_exists: append