Add generate_sample_folder.py and load_folder.py for clustered SAS file generation and loading

Introduce generate_sample_folder.py to create a test folder with clustered SAS XPORT files, including configurations for schema compatibility checks. Implement load_folder.py to facilitate loading entire directories of SAS files into Postgres, supporting explicit and auto-detect clustering. Update sample_folder_config.yaml for usage examples and configuration structure. Enhance load_sas.py with a public schema compatibility check function for orchestrators.
This commit is contained in:
David Peterson 2026-04-18 11:25:04 -05:00
parent 5645ff5597
commit 5b48872dd7
4 changed files with 810 additions and 0 deletions

View File

@ -0,0 +1,185 @@
"""Generate a folder of clustered SAS XPORT files for testing ``load_folder``.
Produces ``samples/folder_test/`` containing three clusters:
* ``group_a{1,2,3}.xpt`` - kitchen-sink schema (every column).
* ``group_b{1,2}.xpt`` - a *different* schema (drops ``BIGINT`` and
``TIMECOL``) so a schema-compat check would catch cross-cluster
contamination if the regex were wrong.
* ``standalone.xpt`` - singleton to exercise the no-cluster / singleton
auto-detect path.
Alongside the files, writes ``folder_config.yaml`` (inside that folder) that exercises
both code paths: ``group_a*`` via an explicit regex pattern, ``group_b*``
and ``standalone`` via auto-detect.
Finally, runs :func:`load_folder.discover_clusters` against the generated
folder and asserts the grouping is what we expect. This is a pure in-process
smoke test of the clustering logic; no Postgres connection is required.
Reuses ``generate_sample_sas.build_dataframe`` so data shape / dtypes match
the single-file loader tests. ``N_ROWS`` is temporarily shrunk on the
imported module for this run's duration so repeated invocations stay fast.
"""
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadstat
import yaml
import generate_sample_sas as gss
from load_folder import discover_clusters, load_folder_config
# Row count temporarily assigned to ``generate_sample_sas.N_ROWS`` while a
# fixture frame is built (see ``_build_df``); small so regeneration is quick.
FIXTURE_ROWS = 2_000
# Directory that receives the generated .xpt files (relative path).
OUT_DIR = Path("samples/folder_test")
# The generated YAML config is written inside the fixture folder itself.
CONFIG_PATH = OUT_DIR / "folder_config.yaml"
# File names that make up each cluster of the fixture folder.
GROUP_A_FILES = ["group_a1.xpt", "group_a2.xpt", "group_a3.xpt"]
GROUP_B_FILES = ["group_b1.xpt", "group_b2.xpt"]
STANDALONE_FILE = "standalone.xpt"
# Columns dropped from the group_b cluster so it has a genuinely different
# schema from the group_a cluster. If the regex accidentally pulled a group_b file
# into the group_a cluster (or vice versa), load_cluster's schema-compat check
# would fire on these differences.
GROUP_B_DROPPED_COLUMNS = ("BIGINT", "TIMECOL")
def _build_df(seed: int) -> pd.DataFrame:
    """Return a fixture-sized DataFrame from ``generate_sample_sas``.

    ``generate_sample_sas.N_ROWS`` is overridden with ``FIXTURE_ROWS`` only
    for the duration of the build and is put back afterwards, even when the
    generator raises, so the imported module is left as it was found.

    Args:
        seed: Seed for the NumPy random generator handed to the builder.
    """
    original_n_rows = gss.N_ROWS
    gss.N_ROWS = FIXTURE_ROWS
    try:
        return gss.build_dataframe(np.random.default_rng(seed))
    finally:
        # Always restore the module-level value we patched above.
        gss.N_ROWS = original_n_rows
def _write_xport(df: pd.DataFrame, path: Path, table_name: str) -> None:
    """Write ``df`` to ``path`` as a version-5 SAS XPORT file.

    Args:
        df: Frame to serialise.
        path: Destination file; its basename is embedded in the file label.
        table_name: Dataset name stored in the XPORT file.
    """

    def _for_present_columns(mapping):
        # Restrict per-column metadata to columns this frame really has;
        # write_xport errors on formats referencing missing columns.
        return {
            column: value
            for column, value in mapping.items()
            if column in df.columns
        }

    pyreadstat.write_xport(
        df,
        str(path),
        file_format_version=5,
        table_name=table_name,
        file_label=f"Folder-loader fixture ({path.name})",
        column_labels=_for_present_columns(gss.COLUMN_LABELS),
        variable_format=_for_present_columns(gss.VARIABLE_FORMATS),
    )
def generate_fixtures() -> None:
    """Create ``OUT_DIR`` and write every fixture XPORT file into it.

    Group A files keep the full generated schema, group B files have
    ``GROUP_B_DROPPED_COLUMNS`` removed, and the standalone file again uses
    the full schema. One progress line is printed per file written.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    def _emit(frame: pd.DataFrame, filename: str, table_name: str) -> None:
        # Write one fixture file and report its size.
        target = OUT_DIR / filename
        _write_xport(frame, target, table_name=table_name)
        print(f" wrote {target} ({len(frame):,} rows, {len(frame.columns)} cols)")

    for offset, filename in enumerate(GROUP_A_FILES):
        _emit(_build_df(seed=100 + offset), filename, f"GRPA{offset + 1}")

    dropped = list(GROUP_B_DROPPED_COLUMNS)
    for offset, filename in enumerate(GROUP_B_FILES):
        frame = _build_df(seed=200 + offset).drop(columns=dropped)
        _emit(frame, filename, f"GRPB{offset + 1}")

    _emit(_build_df(seed=300), STANDALONE_FILE, "STDALONE")
def write_config() -> None:
    """Write the folder-loader YAML config to ``CONFIG_PATH``.

    The file starts with a short explanatory comment header followed by the
    YAML mapping, with keys kept in the order they are declared here.
    """
    header = (
        "# Generated by generate_sample_folder.py. Demonstrates both\n"
        "# explicit regex clustering (group_a*) and auto-detect\n"
        "# (group_b* and standalone) working together.\n"
    )
    explicit_group_a = {
        "pattern": r"^group_a\d+\.xpt$",
        "tablename": "group_a",
    }
    settings = {
        # The config lives inside the target folder, hence ".".
        "folder": ".",
        "schemaname": "public",
        "if_exists": "replace",
        "auto_detect": True,
        "clusters": [explicit_group_a],
    }
    with CONFIG_PATH.open("w", encoding="utf-8") as out:
        out.write(header)
        yaml.safe_dump(settings, out, sort_keys=False)
    print(f" wrote {CONFIG_PATH}")
def verify() -> None:
    """Check that ``discover_clusters`` groups the generated folder as intended.

    Loads the config written to ``CONFIG_PATH``, runs discovery, and asserts
    the set of cluster names plus the source and file list of each cluster.
    Prints the resulting grouping when every check passes.
    """
    clusters = discover_clusters(load_folder_config(CONFIG_PATH))
    by_name = {spec.tablename: spec for spec in clusters}

    expected_names = {"group_a", "group_b", "standalone"}
    actual_names = set(by_name)
    assert expected_names == actual_names, (
        f"cluster set mismatch: expected {expected_names}, got {actual_names}"
    )

    # (cluster name, expected source, expected file names in order)
    expectations = (
        ("group_a", "explicit", sorted(GROUP_A_FILES)),
        ("group_b", "auto", sorted(GROUP_B_FILES)),
        ("standalone", "auto", [STANDALONE_FILE]),
    )
    for label, want_source, want_files in expectations:
        spec = by_name[label]
        assert spec.source == want_source, f"{label} source = {spec.source!r}"
        got_files = [f.name for f in spec.files]
        assert got_files == want_files, f"{label} files = {got_files}"

    print(" clustering verified:")
    for spec in clusters:
        listing = ", ".join(f.name for f in spec.files)
        print(f" {spec.tablename} [{spec.source}]: {listing}")
def main() -> int:
    """Generate the fixtures, write the config, verify clustering; return 0."""
    steps = (
        (f"Writing fixture SAS files to {OUT_DIR}/", generate_fixtures),
        (f"\nWriting folder config to {CONFIG_PATH}", write_config),
        ("\nVerifying discover_clusters() grouping...", verify),
    )
    for banner, step in steps:
        print(banner)
        step()

    print("\nOK. Try:")
    print(f" python load_folder.py --config {CONFIG_PATH} --dry-run")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,555 @@
"""Folder-level SAS-to-Postgres loader.
Wraps :mod:`load_sas` so an entire directory of SAS files can be ingested in
one invocation. A directory often contains several *clusters* of files that
share a schema (e.g. ``group_a1.sas7bdat``, ``group_a2.sas7bdat``, ...). Each
cluster becomes one Postgres table; files inside a cluster are appended to it.
-------------------------------------------------------------------------------
USAGE
-------------------------------------------------------------------------------
1. YAML config
--------------
::
folder: samples/folder_test # required; relative paths resolve against
# the config file's directory
schemaname: public # required
# Optional. One of: fail | replace | append. Default: fail.
# Applied to the first file of each cluster (subsequent files in the
# cluster always run through the append-mode compatibility check).
if_exists: fail
# Optional. Default: true. When true, files that don't match any explicit
# pattern below are grouped by their common prefix (trailing digits, and
# optional trailing separators, are stripped from each file stem).
auto_detect: true
# Optional. Columns to force-include or force-exclude across every file.
# include and exclude are mutually exclusive.
# include: [ID, INTCOL]
# exclude: [ALLNULL]
# Optional explicit cluster patterns. Each pattern is matched against the
# file *basename*. Matched files are pulled out of the auto-detect pool.
# Per-cluster if_exists/include/exclude override the folder-level defaults.
clusters:
- pattern: '^group_a\\d+\\.sas7bdat$'
tablename: group_a
- pattern: '^group_b\\d+\\.sas7bdat$'
tablename: group_b
if_exists: replace
2. Command-line interface
-------------------------
::
python load_folder.py --config folder_config.yaml [--dry-run] [--fail-fast]
Flags:
--config PATH Required. Path to the YAML config above.
--dry-run Print the discovered clusters and the inferred CREATE
TABLE for each (schema from the first file of the
cluster). The database is never touched.
--fail-fast Abort the whole run on the first cluster failure.
Default is to log the failure, roll that cluster back,
and keep going.
Exit codes:
0 - every cluster loaded successfully (or dry-run completed)
1 - at least one cluster failed (details on stderr)
2 - folder does not exist / contains no SAS files
3. Discovery rules
------------------
* Supported extensions: ``.sas7bdat``, ``.xpt``, ``.xport`` (matches
:mod:`load_sas`). The folder is not scanned recursively.
* Explicit patterns are tried in order, and a file matched by one pattern is
removed from the pool before the next pattern runs. Overlap is not tolerated,
though: ``discover_clusters`` checks the folder listing up front and raises
``ValueError`` if any file matches more than one explicit pattern (a file
matching two patterns is almost always a bug).
* Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with
any trailing ``_`` / ``-`` stripped afterward. Stems without trailing
digits become singleton clusters named after the stem.
4. Library usage
----------------
::
from load_folder import load_folder_config, discover_clusters, load_cluster
from load_sas import connect
cfg = load_folder_config("folder_config.yaml")
clusters = discover_clusters(cfg)
conn = connect()
try:
for cluster in clusters:
load_cluster(conn, cluster, cfg.schemaname)
finally:
conn.close()
"""
from __future__ import annotations
import argparse
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import yaml
from dotenv import load_dotenv
from load_sas import (
VALID_IF_EXISTS,
apply_column_filter,
assert_schema_compatible,
connect,
copy_dataframes,
create_table,
infer_schema,
iter_sas_chunks,
read_sas_preview,
render_create_table,
)
# File suffixes treated as SAS data files. ``_list_sas_files`` compares the
# lower-cased suffix of each directory entry against this tuple.
SAS_EXTENSIONS = (".sas7bdat", ".xpt", ".xport")
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class ClusterSpec:
    """A group of files that ``load_cluster`` loads into a single table."""

    # Destination table name: the explicit ``tablename`` from the config, or
    # the auto-detected prefix for auto clusters.
    tablename: str
    # Files in the cluster, sorted by ``discover_clusters``. Empty when an
    # explicit pattern matched nothing in the folder.
    files: List[Path]
    # if_exists mode handed to ``create_table`` for the cluster's first file.
    if_exists: str
    # Column filters forwarded to ``apply_column_filter`` for every file.
    include: Optional[List[str]]
    exclude: Optional[List[str]]
    source: str  # "explicit" or "auto"
    # Raw regex text for explicit clusters; left as None for auto clusters.
    pattern: Optional[str] = None
@dataclass
class _ExplicitPattern:
    """Parsed form of a single ``clusters[*]`` YAML entry."""

    # Compiled regex; ``discover_clusters`` applies it with ``search`` to each
    # file's basename.
    pattern: re.Pattern
    # Original regex text, kept for error messages and for ClusterSpec.pattern.
    raw_pattern: str
    # Destination table for files matched by this pattern.
    tablename: str
    # Per-cluster overrides; None means "fall back to the folder-level value".
    if_exists: Optional[str] = None
    include: Optional[List[str]] = None
    exclude: Optional[List[str]] = None
@dataclass
class FolderConfig:
    """Validated folder-level configuration produced by ``load_folder_config``."""

    # Directory that is scanned for SAS files.
    folder: Path
    # Target Postgres schema name.
    schemaname: str
    # Folder-wide default if_exists mode; a cluster entry may override it.
    if_exists: str = "fail"
    # When true, files not claimed by an explicit pattern are grouped by prefix.
    auto_detect: bool = True
    # Folder-wide column filters; a cluster entry may override them.
    include: Optional[List[str]] = None
    exclude: Optional[List[str]] = None
    # Explicit cluster patterns, in the order they appear in the config.
    explicit: List[_ExplicitPattern] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Config loading
# ---------------------------------------------------------------------------
def _validate_if_exists(value: Any, where: str) -> str:
    """Return ``value`` as a lower-cased string if it is a valid if_exists mode.

    Args:
        value: Raw value from the config (converted with ``str``).
        where: Location prefix used in the error message.

    Raises:
        ValueError: If the normalised value is not in ``VALID_IF_EXISTS``.
    """
    normalised = str(value).lower()
    if normalised in VALID_IF_EXISTS:
        return normalised
    raise ValueError(
        f"{where}: if_exists={value!r} is not one of {VALID_IF_EXISTS}"
    )
def _parse_columns_filter(
    raw: Dict[str, Any], where: str
) -> Tuple[Optional[List[str]], Optional[List[str]]]:
    """Extract and validate the ``include`` / ``exclude`` keys of a mapping.

    Args:
        raw: Mapping that may carry ``include`` and/or ``exclude``.
        where: Location prefix used in error messages.

    Returns:
        ``(include, exclude)``; each is a list of ``str`` column names, or
        ``None`` when the key is absent or set to ``None``.

    Raises:
        ValueError: If both keys are given, or either is not a list.
    """
    selections = {"include": raw.get("include"), "exclude": raw.get("exclude")}

    # Mutual exclusivity is checked before the per-key type checks.
    if not any(names is None for names in selections.values()):
        raise ValueError(f"{where}: 'include' and 'exclude' are mutually exclusive.")

    for key, names in selections.items():
        if names is not None and not isinstance(names, list):
            raise ValueError(f"{where}: '{key}' must be a list of column names.")

    def _as_names(names):
        # Coerce every entry to str; keep None as "not specified".
        return None if names is None else [str(name) for name in names]

    return _as_names(selections["include"]), _as_names(selections["exclude"])
def load_folder_config(path: Path) -> FolderConfig:
    """Read the YAML file at ``path`` and return a validated ``FolderConfig``.

    A relative ``folder`` is resolved against the config file's directory when
    that location exists; otherwise the relative path is kept unchanged.

    Raises:
        ValueError: If the document is not a mapping, a required key is
            missing, ``clusters`` is malformed, a regex does not compile, or
            an ``if_exists`` / ``include`` / ``exclude`` value is invalid.
    """
    path = Path(path)
    with path.open("r", encoding="utf-8") as handle:
        raw = yaml.safe_load(handle)

    if not isinstance(raw, dict):
        raise ValueError(f"Config at {path} must be a YAML mapping at the top level.")

    absent = [key for key in ("folder", "schemaname") if key not in raw]
    if absent:
        raise ValueError(f"Config {path} missing required keys: {', '.join(absent)}")

    folder = Path(raw["folder"])
    if not folder.is_absolute():
        beside_config = (path.parent / folder).resolve()
        if beside_config.exists():
            folder = beside_config

    top_level = f"Config {path}"
    schemaname = str(raw["schemaname"])
    default_if_exists = _validate_if_exists(raw.get("if_exists", "fail"), top_level)
    auto_detect = bool(raw.get("auto_detect", True))
    include, exclude = _parse_columns_filter(raw, top_level)

    entries = raw.get("clusters") or []
    if not isinstance(entries, list):
        raise ValueError(f"Config {path}: 'clusters' must be a list if present.")

    patterns: List[_ExplicitPattern] = []
    for index, entry in enumerate(entries):
        where = f"Config {path} clusters[{index}]"
        if not isinstance(entry, dict):
            raise ValueError(f"{where} must be a mapping.")
        if not ("pattern" in entry and "tablename" in entry):
            raise ValueError(f"{where} must include 'pattern' and 'tablename'.")

        regex_text = str(entry["pattern"])
        try:
            regex = re.compile(regex_text)
        except re.error as exc:
            raise ValueError(f"{where}: invalid regex {regex_text!r}: {exc}") from exc

        # None means "inherit the folder-level if_exists".
        override = None
        if "if_exists" in entry:
            override = _validate_if_exists(entry["if_exists"], where)
        entry_include, entry_exclude = _parse_columns_filter(entry, where)

        patterns.append(
            _ExplicitPattern(
                pattern=regex,
                raw_pattern=regex_text,
                tablename=str(entry["tablename"]),
                if_exists=override,
                include=entry_include,
                exclude=entry_exclude,
            )
        )

    return FolderConfig(
        folder=folder,
        schemaname=schemaname,
        if_exists=default_if_exists,
        auto_detect=auto_detect,
        include=include,
        exclude=exclude,
        explicit=patterns,
    )
# ---------------------------------------------------------------------------
# Cluster discovery
# ---------------------------------------------------------------------------
# Matches the run of digits at the end of a file stem.
_TRAILING_DIGIT_RE = re.compile(r"\d+$")


def _auto_prefix(stem: str) -> str:
    """Return the auto-detect cluster key for a file stem.

    Trailing digits are removed first, then any trailing ``_`` / ``-``, so
    ``group_a1``, ``group_a_2`` and ``group_a-3`` all map to ``group_a``.
    When that would leave an empty string, the original stem is used as the
    key instead.
    """
    without_digits = _TRAILING_DIGIT_RE.sub("", stem)
    key = without_digits.rstrip("_-")
    if key:
        return key
    return stem
def _list_sas_files(folder: Path) -> List[Path]:
    """Return the SAS files directly inside ``folder``, in sorted order.

    Only regular files whose lower-cased suffix is in ``SAS_EXTENSIONS`` are
    kept; sub-directories are not descended into.
    """
    return [
        entry
        for entry in sorted(folder.iterdir())
        if entry.is_file() and entry.suffix.lower() in SAS_EXTENSIONS
    ]
def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
    """List ``cfg.folder`` and group its SAS files into ``ClusterSpec`` objects.

    No SAS file is opened; the only filesystem access is the directory
    listing. Explicit patterns are applied in config order and each claims
    its matches from the remaining pool. An explicit pattern that matches
    nothing still yields a ``ClusterSpec`` with an empty ``files`` list.
    When ``cfg.auto_detect`` is true, leftover files are bucketed by
    ``_auto_prefix`` of their stem and emitted in sorted key order.

    Raises:
        FileNotFoundError: If ``cfg.folder`` is missing or not a directory.
        ValueError: If one file matches two explicit patterns.
    """
    if not (cfg.folder.exists() and cfg.folder.is_dir()):
        raise FileNotFoundError(f"Folder not found or not a directory: {cfg.folder}")

    pool = _list_sas_files(cfg.folder)

    # Report cross-pattern overlap before any grouping happens, so the error
    # names both offending patterns.
    for index, earlier in enumerate(cfg.explicit):
        for later in cfg.explicit[index + 1:]:
            clash = next(
                (
                    f
                    for f in pool
                    if earlier.pattern.search(f.name) and later.pattern.search(f.name)
                ),
                None,
            )
            if clash is not None:
                raise ValueError(
                    f"File {clash.name!r} matches multiple explicit patterns: "
                    f"{earlier.raw_pattern!r} and {later.raw_pattern!r}"
                )

    clusters: List[ClusterSpec] = []
    unclaimed = list(pool)
    for spec in cfg.explicit:
        claimed = [f for f in unclaimed if spec.pattern.search(f.name)]
        if claimed:
            unclaimed = [f for f in unclaimed if f not in claimed]
        # A pattern with no matches is not an error; it is still recorded
        # (with no files) so the CLI can report it.
        clusters.append(
            ClusterSpec(
                tablename=spec.tablename,
                files=sorted(claimed),
                if_exists=spec.if_exists or cfg.if_exists,
                include=spec.include if spec.include is not None else cfg.include,
                exclude=spec.exclude if spec.exclude is not None else cfg.exclude,
                source="explicit",
                pattern=spec.raw_pattern,
            )
        )

    if cfg.auto_detect and unclaimed:
        buckets: Dict[str, List[Path]] = {}
        for f in unclaimed:
            buckets.setdefault(_auto_prefix(f.stem), []).append(f)
        clusters.extend(
            ClusterSpec(
                tablename=key,
                files=sorted(buckets[key]),
                if_exists=cfg.if_exists,
                include=cfg.include,
                exclude=cfg.exclude,
                source="auto",
            )
            for key in sorted(buckets)
        )

    return clusters
# ---------------------------------------------------------------------------
# Per-cluster load
# ---------------------------------------------------------------------------
def _infer_cluster_schema(path: Path, include, exclude):
    """Infer the column specs for ``path`` from its preview rows.

    The preview frame is passed through the include/exclude filter before
    inference, and the file's ``number_rows`` metadata (``None`` when the
    attribute is absent) is forwarded as ``total_rows``.
    """
    preview, meta = read_sas_preview(path)
    filtered = apply_column_filter(preview, include, exclude)
    return infer_schema(
        filtered,
        meta,
        total_rows=getattr(meta, "number_rows", None),
    )
def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int:
    """Load all files of ``cluster`` into one table and return the row total.

    The first file goes through ``create_table`` with the cluster's
    ``if_exists`` mode; every later file has its inferred schema checked with
    ``assert_schema_compatible`` before it is streamed in. An empty cluster
    loads nothing and returns 0.

    Transaction boundaries belong to the caller: this function neither
    commits nor rolls back (``main`` does so once per cluster).
    """
    loaded = 0
    for position, path in enumerate(cluster.files):
        columns = _infer_cluster_schema(path, cluster.include, cluster.exclude)
        if position == 0:
            create_table(
                conn, schemaname, cluster.tablename, columns, cluster.if_exists
            )
        else:
            # Same check that if_exists=append runs. A mismatch raises and
            # aborts the cluster; main()'s rollback then undoes partial work.
            assert_schema_compatible(conn, schemaname, cluster.tablename, columns)
        loaded += _stream_file(
            conn,
            schemaname,
            cluster.tablename,
            path,
            columns,
            cluster.include,
            cluster.exclude,
        )
    return loaded
def _stream_file(
    conn,
    schemaname: str,
    tablename: str,
    path: Path,
    columns,
    include,
    exclude,
) -> int:
    """Stream ``path`` chunk by chunk into the table; return ``copy_dataframes``' result.

    Each chunk is passed through the include/exclude filter and a running
    row count is written to stderr before the chunk is handed on.
    """

    def _filtered_chunks():
        rows_so_far = 0
        for frame, _meta in iter_sas_chunks(path):
            frame = apply_column_filter(frame, include, exclude)
            rows_so_far += len(frame)
            print(
                f" {path.name}: streaming... {rows_so_far:,} rows",
                file=sys.stderr,
            )
            yield frame

    return copy_dataframes(conn, schemaname, tablename, _filtered_chunks(), columns)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser: ``--config``, ``--dry-run``, ``--fail-fast``."""
    parser = argparse.ArgumentParser(
        description=(
            "Load every SAS file in a folder into Postgres, grouping files "
            "into clusters that each become one table."
        ),
    )
    parser.add_argument(
        "--config",
        required=True,
        type=Path,
        help="Path to YAML config",
    )

    # Both switches are plain booleans that default to False.
    dry_run_help = (
        "Print discovered clusters and the inferred CREATE TABLE for "
        "each; don't touch Postgres."
    )
    parser.add_argument("--dry-run", action="store_true", help=dry_run_help)

    fail_fast_help = (
        "Abort on the first cluster failure. Default is to roll that "
        "cluster back and continue with the next one."
    )
    parser.add_argument("--fail-fast", action="store_true", help=fail_fast_help)
    return parser
def _describe_cluster(cluster: ClusterSpec) -> str:
    """Return a two-line, human-readable summary of ``cluster``.

    The first line carries the table name, the source (with the regex for
    clusters that have one) and the if_exists mode; the second lists the
    file names, or a placeholder when there are none.
    """
    origin = str(cluster.source)
    if cluster.pattern:
        origin = f"{origin} pattern={cluster.pattern!r}"
    names = [f.name for f in cluster.files]
    listing = ", ".join(names) or "(no matching files)"
    headline = (
        f"cluster {cluster.tablename!r} [{origin}] if_exists={cluster.if_exists}"
    )
    return f"{headline}\n files: {listing}"
def main(argv: Optional[List[str]] = None) -> int:
    """Run the folder loader CLI and return the process exit code.

    Returns:
        0 when every cluster loaded (or the dry run finished), 1 when at
        least one cluster failed, 2 when the folder is missing or no
        cluster has any file to load.
    """
    options = _build_argparser().parse_args(argv)
    load_dotenv()
    cfg = load_folder_config(options.config)

    if not (cfg.folder.exists() and cfg.folder.is_dir()):
        print(f"error: folder not found: {cfg.folder}", file=sys.stderr)
        return 2

    clusters = discover_clusters(cfg)
    loadable = [spec for spec in clusters if spec.files]
    if not loadable:
        print(
            f"error: no SAS files found in {cfg.folder} "
            f"(looked for {', '.join(SAS_EXTENSIONS)})",
            file=sys.stderr,
        )
        return 2

    print(f"discovered {len(loadable)} cluster(s) in {cfg.folder}:")
    # Empty explicit clusters are described too, so unmatched patterns show up.
    for spec in clusters:
        print(_describe_cluster(spec))

    if options.dry_run:
        print()
        for spec in loadable:
            print(f"--- CREATE TABLE for cluster {spec.tablename!r} ---")
            preview_columns = _infer_cluster_schema(
                spec.files[0], spec.include, spec.exclude
            )
            print(render_create_table(cfg.schemaname, spec.tablename, preview_columns))
            print()
        return 0

    conn = connect()
    conn.autocommit = False
    failed: List[Tuple[str, Exception]] = []
    succeeded: List[Tuple[str, int, int]] = []  # (tablename, files, rows)
    try:
        for spec in loadable:
            file_count = len(spec.files)
            print(
                f"\n>>> loading cluster {spec.tablename!r} "
                f"({file_count} file(s))"
            )
            try:
                row_count = load_cluster(conn, spec, cfg.schemaname)
                # One transaction per cluster: commit only after all of its
                # files went in.
                conn.commit()
                succeeded.append((spec.tablename, file_count, row_count))
                print(
                    f" -> loaded {row_count:,} row(s) into "
                    f"{cfg.schemaname}.{spec.tablename}"
                )
            except Exception as exc:
                # Undo this cluster's partial work, record it, and move on
                # unless --fail-fast was requested.
                conn.rollback()
                failed.append((spec.tablename, exc))
                print(
                    f" !! cluster {spec.tablename!r} failed: {exc}",
                    file=sys.stderr,
                )
                if options.fail_fast:
                    break
    finally:
        conn.close()

    print("\n=== summary ===")
    for table, file_count, row_count in succeeded:
        print(f" ok {table}: {file_count} file(s), {row_count:,} row(s)")
    for table, error in failed:
        print(f" FAIL {table}: {error}", file=sys.stderr)
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -854,6 +854,22 @@ def _assert_schema_compatible(
) )
def assert_schema_compatible(
    conn,
    schema_name: str,
    table_name: str,
    columns: Dict[str, ColumnSpec],
) -> None:
    """Public wrapper around :func:`_assert_schema_compatible`.

    Intended for orchestrators (e.g. the folder loader) that append multiple
    files into one table and need to re-run the same compatibility check
    that ``if_exists=append`` performs internally. Raises
    :class:`SchemaCompatibilityError` on mismatch.

    All four arguments are forwarded positionally and unchanged to the
    private helper; this wrapper adds no behaviour of its own.
    """
    # Pure delegation, so callers outside this module have a public name to use.
    _assert_schema_compatible(conn, schema_name, table_name, columns)
def create_table( def create_table(
conn, conn,
schema_name: str, schema_name: str,

View File

@ -0,0 +1,54 @@
# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects:
#
# python load_folder.py --config sample_folder_config.yaml --dry-run
# python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if that doesn't exist.
folder: samples/folder_test
schemaname: public
# Applied when loading the first file of each cluster (later files in the
# same cluster are always appended after a schema-compatibility check).
# One of: fail | replace | append. Default: fail.
if_exists: replace
# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
auto_detect: true
# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
# override these via its own `include` / `exclude` keys.
#
# include:
# - ID
# - INTCOL
# exclude:
# - ALLNULL
# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, and `exclude` are
# optional per-cluster overrides of the folder-level defaults above.
clusters:
- pattern: '^group_a\d+\.xpt$'
tablename: group_a
# Example of an explicit override. Uncomment to force the group_b cluster to
# append instead of replace even though the folder default is "replace":
#
# - pattern: '^group_b\d+\.xpt$'
# tablename: group_b
# if_exists: append
# With only the group_a pattern explicit, auto_detect: true will still bucket
# group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout.