foxtrot/generic_loader/sample_folder_config.yaml
2026-04-20 10:18:09 -05:00

99 lines
3.4 KiB
YAML

# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects:
#
# python load_folder.py --config sample_folder_config.yaml --dry-run
# python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if that doesn't exist.
folder: samples/folder_test
schemaname: public
# Applied when creating the first file of each cluster.
# One of: fail | replace | append. Default: fail.
if_exists: replace
# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
auto_detect: true
# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
# override these via its own `include` / `exclude` keys.
#
# include:
# - ID
# - INTCOL
# exclude:
# - ALLNULL
# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires if_exists: replace or fail (not append for initial creation).
# Single field:
# partition_by: state
# Multiple fields (cascading):
# partition_by:
# - state
# - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
# max_partitions: 10000
# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
# indexes: state
# Multiple columns (one index per column):
# indexes:
# - state
# - zip
# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, and `exclude` are
# optional per-cluster overrides of the folder-level defaults above.
clusters:
- pattern: '^group_a\d+\.xpt$'
tablename: group_a
# Example of an explicit override. Uncomment to force the group_b cluster to
# append instead of replace even though the folder default is "replace":
#
# - pattern: '^group_b\d+\.xpt$'
# tablename: group_b
# if_exists: append
# Per-cluster partition_by / max_partitions override. These take precedence
# over the folder-level defaults above.
#
# - pattern: '^group_c\d+\.xpt$'
# tablename: group_c
# partition_by:
# - region
# - year
# max_partitions: 500
# Per-cluster indexes override. Takes precedence over the folder-level
# indexes default above. An explicit empty list disables indexing for
# this cluster even when the folder default has indexes.
#
# - pattern: '^group_d\d+\.xpt$'
# tablename: group_d
# indexes:
# - region
# - year
# With only the gq pattern explicit, auto_detect: true will still bucket
# group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout.