# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects. Usage:
#
#   python load_folder.py --config sample_folder_config.yaml --dry-run
#   python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if the path doesn't exist
# there.

# Folder whose files are loaded and grouped into clusters.
folder: samples/folder_test
# NOTE(review): presumably the target database schema for the cluster
# tables - confirm against load_folder.py.
schemaname: public

# Applied when the first file of each cluster is loaded (i.e. when the
# cluster's table is created).
# One of: fail | replace | append. Default: fail.
if_exists: replace

# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
# override these via its own `include` / `exclude` keys.
#
# Uncomment only ONE of the two examples below:
#
# include:
#   - ID
#   - INTCOL
# exclude:
#   - ALLNULL

# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires `if_exists: replace` or `if_exists: fail`; `append` cannot be used
# for the initial creation.
# Single field:
#   partition_by: state
# Multiple fields (cascading):
#   partition_by:
#     - state
#     - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
#   max_partitions: 10000

# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
#   indexes: state
# Multiple columns (one index per column):
#   indexes:
#     - state
#     - zip

# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, `exclude`,
# `partition_by`, `max_partitions`, and `indexes` are optional per-cluster
# overrides of the folder-level defaults above.
clusters:
  - pattern: '^group_a\d+\.xpt$'
    tablename: group_a

# Example of an explicit override. Uncomment to force the group_b cluster to
# append instead of replace even though the folder default is "replace":
#
#   - pattern: '^group_b\d+\.xpt$'
#     tablename: group_b
#     if_exists: append

# Per-cluster partition_by / max_partitions override. These take precedence
# over the folder-level defaults above.
#
#   - pattern: '^group_c\d+\.xpt$'
#     tablename: group_c
#     partition_by:
#       - region
#       - year
#     max_partitions: 500

# Per-cluster indexes override. Takes precedence over the folder-level
# indexes default above. An explicit empty list (`indexes: []`) disables
# indexing for this cluster even when the folder default has indexes.
#
#   - pattern: '^group_d\d+\.xpt$'
#     tablename: group_d
#     indexes:
#       - region
#       - year

# Embedded-digit example. When the varying number sits in the MIDDLE of
# the stem (e.g. year2020_regionA_40_detail.sas7bdat,
# year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
# them - each file becomes its own singleton cluster. An explicit
# pattern groups them correctly. The \d+ matches any width, and
# files within the cluster are sorted numerically by the last digit
# group in the stem, so _9_ sorts before _40_ regardless of zero-
# padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
# fine - whatever files are present get loaded in numeric order.
#
#   - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
#     tablename: year2020_regionA_detail

# With only the group_a pattern explicit, auto_detect: true will still
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout.