# foxtrot/generic_loader/generate_sample_folder.py

"""Generate a folder of clustered SAS XPORT files for testing ``load_folder``.

Produces ``samples/folder_test/`` containing three clusters:

* ``group_a{1,2,3}.xpt`` - kitchen-sink schema (every column).
* ``group_b{1,2}.xpt`` - a *different* schema (drops ``BIGINT`` and
  ``TIMECOL``) so a schema-compat check would catch cross-cluster
  contamination if the regex were wrong.
* ``standalone.xpt`` - singleton to exercise the no-cluster / singleton
  auto-detect path.

Alongside the files, writes ``folder_config.yaml`` (inside the same folder)
that exercises both code paths: ``group_a*`` via an explicit regex pattern,
``group_b*`` and ``standalone`` via auto-detect.

Finally, runs :func:`load_folder.discover_clusters` against the generated
folder and asserts the grouping is what we expect. This is a pure in-process
smoke test of the clustering logic; no Postgres connection is required.

Reuses ``generate_sample_sas.build_dataframe`` so data shape / dtypes match
the single-file loader tests. ``N_ROWS`` is temporarily shrunk on the
imported module while each fixture frame is built (and restored right
afterwards) so repeated invocations stay fast.
"""

from __future__ import annotations

import sys
from pathlib import Path

import numpy as np
import pandas as pd
import pyreadstat
import yaml

import generate_sample_sas as gss
from load_folder import discover_clusters, load_folder_config


# Row count assigned to ``generate_sample_sas.N_ROWS`` by ``_build_df`` while
# each fixture frame is generated.
FIXTURE_ROWS = 2_000
# Folder that receives the generated .xpt files. The path is relative, so it
# resolves against the current working directory.
OUT_DIR = Path("samples/folder_test")
# Generated folder-loader config; written into the same folder as the fixtures.
CONFIG_PATH = OUT_DIR / "folder_config.yaml"

# File names that make up each cluster ``verify`` expects to find.
GROUP_A_FILES = ["group_a1.xpt", "group_a2.xpt", "group_a3.xpt"]
GROUP_B_FILES = ["group_b1.xpt", "group_b2.xpt"]
STANDALONE_FILE = "standalone.xpt"

# Columns dropped from the group_b cluster so it has a genuinely different
# schema from the group_a cluster. If the regex accidentally pulled a group_b file
# into the group_a cluster (or vice versa), load_cluster's schema-compat check
# would fire on these differences.
GROUP_B_DROPPED_COLUMNS = ("BIGINT", "TIMECOL")


def _build_df(seed: int) -> pd.DataFrame:
    """Return a kitchen-sink DataFrame produced by ``generate_sample_sas``.

    The generator is driven by a NumPy ``Generator`` seeded with *seed*.
    While it runs, ``generate_sample_sas.N_ROWS`` is overridden with
    ``FIXTURE_ROWS`` so the fixture stays small; the previous value is put
    back afterwards, including when the generator raises.
    """
    original_row_count = gss.N_ROWS
    gss.N_ROWS = FIXTURE_ROWS
    try:
        frame = gss.build_dataframe(np.random.default_rng(seed))
    finally:
        # Undo the override so other users of the generator module see the
        # row count they started with.
        gss.N_ROWS = original_row_count
    return frame


def _write_xport(df: pd.DataFrame, path: Path, table_name: str) -> None:
    """Write *df* to *path* as a version-5 XPORT file named *table_name*.

    Column labels and variable formats are taken from ``generate_sample_sas``
    and restricted to the columns actually present in *df*.
    """
    present_columns = df.columns

    def _only_present(mapping):
        # Keep just the entries whose key is a column of this frame.
        return {
            column: value
            for column, value in mapping.items()
            if column in present_columns
        }

    # Formats must be filtered: write_xport errors on formats that reference
    # columns missing from the frame.
    formats = _only_present(gss.VARIABLE_FORMATS)
    labels = _only_present(gss.COLUMN_LABELS)

    pyreadstat.write_xport(
        df,
        str(path),
        file_format_version=5,
        table_name=table_name,
        file_label=f"Folder-loader fixture ({path.name})",
        column_labels=labels,
        variable_format=formats,
    )


def generate_fixtures() -> None:
    """Create ``OUT_DIR`` and write every fixture .xpt file into it.

    Writes the group_a files with the full schema, the group_b files with
    ``GROUP_B_DROPPED_COLUMNS`` removed, and one standalone file with the
    full schema. A one-line summary is printed for each file written.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    def _emit(frame: pd.DataFrame, filename: str, table_name: str) -> None:
        # Write one fixture file, then report its path and dimensions.
        target = OUT_DIR / filename
        _write_xport(frame, target, table_name=table_name)
        print(f"  wrote {target} ({len(frame):,} rows, {len(frame.columns)} cols)")

    # Full-schema cluster; seeds 100, 101, ... keep each file's data distinct.
    for offset, filename in enumerate(GROUP_A_FILES):
        _emit(_build_df(seed=100 + offset), filename, f"GRPA{offset + 1}")

    # Reduced-schema cluster; seeds 200, 201, ...
    dropped = list(GROUP_B_DROPPED_COLUMNS)
    for offset, filename in enumerate(GROUP_B_FILES):
        reduced = _build_df(seed=200 + offset).drop(columns=dropped)
        _emit(reduced, filename, f"GRPB{offset + 1}")

    # Singleton file with the full schema.
    _emit(_build_df(seed=300), STANDALONE_FILE, "STDALONE")


def write_config() -> None:
    """Write the generated folder-loader config to ``CONFIG_PATH``.

    The config declares one explicit regex cluster (``group_a``) and turns
    ``auto_detect`` on, so the remaining files are left to auto-detection.
    A short header comment is written ahead of the YAML body, and the path
    of the written file is printed.

    The parent folder of ``CONFIG_PATH`` is created if it is missing, so this
    function can be called without running ``generate_fixtures`` first.
    """
    cfg = {
        "folder": ".",  # config lives inside the target folder
        "schemaname": "public",
        "if_exists": "replace",
        "auto_detect": True,
        "clusters": [
            {
                "pattern": r"^group_a\d+\.xpt$",
                "tablename": "group_a",
            },
        ],
    }
    # Opening the file for writing fails with FileNotFoundError when the
    # folder does not exist yet; make sure it does.
    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with CONFIG_PATH.open("w", encoding="utf-8") as f:
        # Top-of-file comment documents the intent of this generated config.
        f.write(
            "# Generated by generate_sample_folder.py. Demonstrates both\n"
            "# explicit regex clustering (group_a*) and auto-detect\n"
            "# (group_b* and standalone) working together.\n"
        )
        # sort_keys=False keeps the keys in the order they are declared above.
        yaml.safe_dump(cfg, f, sort_keys=False)
    print(f"  wrote {CONFIG_PATH}")


def verify() -> None:
    """Smoke-test the clustering logic against the generated folder.

    Loads the config at ``CONFIG_PATH``, runs ``discover_clusters`` on it and
    checks that exactly the clusters ``group_a`` (source ``"explicit"``),
    ``group_b`` and ``standalone`` (both source ``"auto"``) come back, each
    with the expected file names. On success a summary is printed.

    Raises:
        AssertionError: if the discovered clusters differ from what the
            generated fixtures should produce. The error is raised
            explicitly rather than via ``assert`` statements, so the checks
            still run when Python is started with ``-O``.
    """

    def _require(condition: bool, message: str) -> None:
        # Explicit replacement for ``assert``: never stripped by ``-O``.
        if not condition:
            raise AssertionError(message)

    cfg = load_folder_config(CONFIG_PATH)
    clusters = discover_clusters(cfg)

    tablenames = [c.tablename for c in clusters]
    by_name = {c.tablename: c for c in clusters}
    # The dict above keeps only the last cluster per tablename; reject
    # duplicates so a repeated cluster cannot hide behind a matching name.
    _require(
        len(by_name) == len(tablenames),
        f"duplicate cluster tablenames: {tablenames}",
    )

    expected_names = {"group_a", "group_b", "standalone"}
    actual_names = set(by_name)
    _require(
        expected_names == actual_names,
        f"cluster set mismatch: expected {expected_names}, got {actual_names}",
    )

    # (tablename, expected source, expected file names in the order compared)
    expectations = (
        ("group_a", "explicit", sorted(GROUP_A_FILES)),
        ("group_b", "auto", sorted(GROUP_B_FILES)),
        ("standalone", "auto", [STANDALONE_FILE]),
    )
    for name, expected_source, expected_files in expectations:
        cluster = by_name[name]
        _require(
            cluster.source == expected_source,
            f"{name} source = {cluster.source!r}",
        )
        actual_files = [f.name for f in cluster.files]
        _require(actual_files == expected_files, f"{name} files = {actual_files}")

    print("  clustering verified:")
    for c in clusters:
        files = ", ".join(f.name for f in c.files)
        print(f"    {c.tablename} [{c.source}]: {files}")


def main() -> int:
    """Generate the fixtures, write the config, verify clustering; return 0.

    Each stage is announced with a banner line before it runs. Any exception
    raised by a stage propagates to the caller.
    """
    stages = (
        (f"Writing fixture SAS files to {OUT_DIR}/", generate_fixtures),
        (f"\nWriting folder config to {CONFIG_PATH}", write_config),
        ("\nVerifying discover_clusters() grouping...", verify),
    )
    for banner, stage in stages:
        print(banner)
        stage()

    # Closing hint showing how to run the loader against the generated config.
    print("\nOK. Try:")
    print(f"  python load_folder.py --config {CONFIG_PATH} --dry-run")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
Add generate_sample_folder.py and load_folder.py for clustered SAS file generation and loading. Introduce generate_sample_folder.py to create a test folder with clustered SAS XPORT files, including configurations for schema compatibility checks. Implement load_folder.py to facilitate loading entire directories of SAS files into Postgres, supporting explicit and auto-detect clustering. Update sample_folder_config.yaml for usage examples and configuration structure. Enhance load_sas.py with a public schema compatibility check function for orchestrators. 2026-04-18 16:25:04 +00:00			"""Generate a folder of clustered SAS XPORT files for testing ``load_folder``.

			Produces ``samples/folder_test/`` containing three clusters:

			* ``group_a{1,2,3}.xpt`` - kitchen-sink schema (every column).
			* ``group_b{1,2}.xpt`` - a different schema (drops ``BIGINT`` and
			``TIMECOL``) so a schema-compat check would catch cross-cluster
			`contamination if the regex were wrong.`
			* ``standalone.xpt`` - singleton to exercise the no-cluster / singleton
			`auto-detect path.`

			Alongside the files, writes ``sample_folder_config.yaml`` that exercises
			both code paths: ``group_a`` via an explicit regex pattern, ``group_b``
			and ``standalone`` via auto-detect.

			Finally, runs :func:`load_folder.discover_clusters` against the generated
			`folder and asserts the grouping is what we expect. This is a pure in-process`
			`smoke test of the clustering logic; no Postgres connection is required.`

			Reuses ``generate_sample_sas.build_dataframe`` so data shape / dtypes match
			the single-file loader tests. ``N_ROWS`` is temporarily shrunk on the
			`imported module for this run's duration so repeated invocations stay fast.`
			`"""`

			`from __future__ import annotations`

			`import sys`
			`from pathlib import Path`

			`import numpy as np`
			`import pandas as pd`
			`import pyreadstat`
			`import yaml`

			`import generate_sample_sas as gss`
			`from load_folder import discover_clusters, load_folder_config`


			`FIXTURE_ROWS = 2_000`
			`OUT_DIR = Path("samples/folder_test")`
			`CONFIG_PATH = OUT_DIR / "folder_config.yaml"`

			`GROUP_A_FILES = ["group_a1.xpt", "group_a2.xpt", "group_a3.xpt"]`
			`GROUP_B_FILES = ["group_b1.xpt", "group_b2.xpt"]`
			`STANDALONE_FILE = "standalone.xpt"`

			`# Columns dropped from the group_b cluster so it has a genuinely different`
			`# schema from the group_a cluster. If the regex accidentally pulled a group_b file`
			`# into the group_a cluster (or vice versa), load_cluster's schema-compat check`
			`# would fire on these differences.`
			`GROUP_B_DROPPED_COLUMNS = ("BIGINT", "TIMECOL")`


			`def _build_df(seed: int) -> pd.DataFrame:`
			`"""Build a kitchen-sink DataFrame via the existing generator.`

			Temporarily shrinks ``generate_sample_sas.N_ROWS`` so each fixture file
			`is small enough to regenerate quickly. Restored afterward so importing`
			`this module alongside the main generator stays side-effect free.`
			`"""`
			`saved = gss.N_ROWS`
			`gss.N_ROWS = FIXTURE_ROWS`
			`try:`
			`rng = np.random.default_rng(seed)`
			`return gss.build_dataframe(rng)`
			`finally:`
			`gss.N_ROWS = saved`


			`def _write_xport(df: pd.DataFrame, path: Path, table_name: str) -> None:`
			`# Only pass variable_format entries for columns that actually exist in`
			`# this frame - write_xport errors on formats referencing missing cols.`
			`variable_format = {`
			`k: v for k, v in gss.VARIABLE_FORMATS.items() if k in df.columns`
			`}`
			`column_labels = {k: v for k, v in gss.COLUMN_LABELS.items() if k in df.columns}`

			`pyreadstat.write_xport(`
			`df,`
			`str(path),`
			`file_format_version=5,`
			`table_name=table_name,`
			`file_label=f"Folder-loader fixture ({path.name})",`
			`column_labels=column_labels,`
			`variable_format=variable_format,`
			`)`


			`def generate_fixtures() -> None:`
			`OUT_DIR.mkdir(parents=True, exist_ok=True)`

			`for i, name in enumerate(GROUP_A_FILES):`
			`df = _build_df(seed=100 + i)`
			`_write_xport(df, OUT_DIR / name, table_name=f"GRPA{i + 1}")`
			`print(f" wrote {OUT_DIR / name} ({len(df):,} rows, {len(df.columns)} cols)")`

			`for i, name in enumerate(GROUP_B_FILES):`
			`df = _build_df(seed=200 + i)`
			`df = df.drop(columns=list(GROUP_B_DROPPED_COLUMNS))`
			`_write_xport(df, OUT_DIR / name, table_name=f"GRPB{i + 1}")`
			`print(f" wrote {OUT_DIR / name} ({len(df):,} rows, {len(df.columns)} cols)")`

			`df = _build_df(seed=300)`
			`_write_xport(df, OUT_DIR / STANDALONE_FILE, table_name="STDALONE")`
			`print(`
			`f" wrote {OUT_DIR / STANDALONE_FILE} "`
			`f"({len(df):,} rows, {len(df.columns)} cols)"`
			`)`


			`def write_config() -> None:`
			`cfg = {`
			`"folder": ".", # config lives inside the target folder`
			`"schemaname": "public",`
			`"if_exists": "replace",`
			`"auto_detect": True,`
			`"clusters": [`
			`{`
			`"pattern": r"^group_a\d+\.xpt$",`
			`"tablename": "group_a",`
			`},`
			`],`
			`}`
			`with CONFIG_PATH.open("w", encoding="utf-8") as f:`
			`# Top-of-file comment documents the intent of this generated config.`
			`f.write(`
			`"# Generated by generate_sample_folder.py. Demonstrates both\n"`
			`"# explicit regex clustering (group_a*) and auto-detect\n"`
			`"# (group_b* and standalone) working together.\n"`
			`)`
			`yaml.safe_dump(cfg, f, sort_keys=False)`
			`print(f" wrote {CONFIG_PATH}")`


			`def verify() -> None:`
			`"""Smoke-test the clustering logic against the generated folder."""`
			`cfg = load_folder_config(CONFIG_PATH)`
			`clusters = discover_clusters(cfg)`

			`by_name = {c.tablename: c for c in clusters}`

			`expected_names = {"group_a", "group_b", "standalone"}`
			`actual_names = set(by_name)`
			`assert expected_names == actual_names, (`
			`f"cluster set mismatch: expected {expected_names}, got {actual_names}"`
			`)`

			`group_a = by_name["group_a"]`
			`assert group_a.source == "explicit", f"group_a source = {group_a.source!r}"`
			`assert [f.name for f in group_a.files] == sorted(GROUP_A_FILES), (`
			`f"group_a files = {[f.name for f in group_a.files]}"`
			`)`

			`group_b = by_name["group_b"]`
			`assert group_b.source == "auto", f"group_b source = {group_b.source!r}"`
			`assert [f.name for f in group_b.files] == sorted(GROUP_B_FILES), (`
			`f"group_b files = {[f.name for f in group_b.files]}"`
			`)`

			`standalone = by_name["standalone"]`
			`assert standalone.source == "auto", f"standalone source = {standalone.source!r}"`
			`assert [f.name for f in standalone.files] == [STANDALONE_FILE], (`
			`f"standalone files = {[f.name for f in standalone.files]}"`
			`)`

			`print(" clustering verified:")`
			`for c in clusters:`
			`files = ", ".join(f.name for f in c.files)`
			`print(f" {c.tablename} [{c.source}]: {files}")`


			`def main() -> int:`
			`print(f"Writing fixture SAS files to {OUT_DIR}/")`
			`generate_fixtures()`
			`print(f"\nWriting folder config to {CONFIG_PATH}")`
			`write_config()`
			`print("\nVerifying discover_clusters() grouping...")`
			`verify()`
			`print("\nOK. Try:")`
			`print(f" python load_folder.py --config {CONFIG_PATH} --dry-run")`
			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`