# foxtrot/generic_loader/sample_folder_config.yaml

# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects:
#
# python load_folder.py --config sample_folder_config.yaml --dry-run
# python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are tried against the directory that holds this config file
# first; if that doesn't exist, they fall back to the current working
# directory.
folder: 'samples/folder_test'
schemaname: 'public'
# Takes effect when creating the first file of each cluster.
# Allowed values: fail | replace | append (default: fail).
if_exists: 'replace'
# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster. For example, group_b1.xpt and group_b2.xpt are both
# grouped into a "group_b" cluster, while a lone standalone.xpt becomes a
# "standalone" cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true
# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive, so uncomment
# at most one of the two examples below. A cluster can override these via
# its own `include` / `exclude` keys.
#
# include:
# - ID
# - INTCOL
#
# exclude:
# - ALLNULL
# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires if_exists: replace or fail (not append for initial creation).
# Single field:
# partition_by: state
# Multiple fields (cascading):
# partition_by:
# - state
# - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
# max_partitions: 10000
# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
# indexes: state
# Multiple columns (one index per column):
# indexes:
# - state
# - zip
# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, and `exclude` are
# optional per-cluster overrides of the folder-level defaults above.
clusters:
# Explicit cluster for files whose basename matches `pattern`; `tablename`
# is the required table name for that cluster. Both keys must belong to the
# same list item, so `tablename` is indented to line up with `pattern`
# (at column 0 it would be parsed as a separate top-level key).
- pattern: '^group_a\d+\.xpt$'
  tablename: group_a
# Example of an explicit override. Uncomment to force the group_b cluster to
# append instead of replace even though the folder default is "replace":
#
# - pattern: '^group_b\d+\.xpt$'
#   tablename: group_b
#   if_exists: append
# Per-cluster partition_by / max_partitions override. These take precedence
# over the folder-level defaults above.
#
# - pattern: '^group_c\d+\.xpt$'
#   tablename: group_c
#   partition_by:
#   - region
#   - year
#   max_partitions: 500
# Per-cluster indexes override. Takes precedence over the folder-level
# indexes default above. An explicit empty list disables indexing for
# this cluster even when the folder default has indexes.
#
# - pattern: '^group_d\d+\.xpt$'
#   tablename: group_d
#   indexes:
#   - region
#   - year
# Embedded-digit example. When the varying number sits in the MIDDLE of
# the stem (e.g. year2020_regionA_40_detail.sas7bdat,
# year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
# them - each file becomes its own singleton cluster. An explicit
# pattern bucketizes them correctly. The \d+ matches any width, and
# files within the cluster are sorted numerically by the last digit
# group in the stem, so _9_ sorts before _40_ regardless of zero-
# padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
# fine - whatever files are present get loaded in numeric order.
#
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
#   tablename: year2020_regionA_detail
# With only the group_a pattern explicit, auto_detect: true will still
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout.