# foxtrot/generic_loader/sample_folder_config.yaml

# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects. Typical invocations:
#
#   python load_folder.py --config sample_folder_config.yaml --dry-run
#   python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if the resolved path does
# not exist.

# Folder whose files are loaded. This is a relative path, so it is resolved
# against this config file's directory first.
folder: samples/folder_test
# Schema name handed to the loader (presumably the target database schema -
# confirm against load_folder.py).
schemaname: public

# Applied when the first file of each cluster is loaded.
# One of: fail | replace | append. Default: fail.
# The value set here is the folder-level default; a cluster can override it
# with its own `if_exists` key.
if_exists: replace

# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
#
# Example: group_b1.xpt and group_b2.xpt are grouped into a "group_b"
# cluster, while a lone standalone.xpt becomes a "standalone" cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
# override these via its own `include` / `exclude` keys.
#
# include:
#   - ID
#   - INTCOL
# exclude:
#   - ALLNULL

# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires if_exists: replace or fail; append cannot be used for the initial
# table creation.
# Single field:
#   partition_by: state
# Multiple fields (cascading):
#   partition_by:
#     - state
#     - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
#   max_partitions: 10000

# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
#   indexes: state
# Multiple columns (one index per column):
#   indexes:
#     - state
#     - zip

# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, `exclude`, `partition_by`,
# `max_partitions`, and `indexes` are optional per-cluster overrides of the
# folder-level defaults above.
clusters:
  # Groups basenames of the form group_a<digits>.xpt (e.g. group_a1.xpt)
  # under the table name `group_a`.
  - pattern: '^group_a\d+\.xpt$'
    tablename: group_a

  # Example of an explicit override. Uncomment to force the group_b cluster to
  # append instead of replace even though the folder default is "replace":
  #
  # - pattern: '^group_b\d+\.xpt$'
  #   tablename: group_b
  #   if_exists: append

  # Per-cluster partition_by / max_partitions override. These take precedence
  # over the folder-level defaults above.
  #
  # - pattern: '^group_c\d+\.xpt$'
  #   tablename: group_c
  #   partition_by:
  #     - region
  #     - year
  #   max_partitions: 500

  # Per-cluster indexes override. Takes precedence over the folder-level
  # indexes default above. An explicit empty list disables indexing for
  # this cluster even when the folder default has indexes.
  #
  # - pattern: '^group_d\d+\.xpt$'
  #   tablename: group_d
  #   indexes:
  #     - region
  #     - year

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each file becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # files within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-
  # padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
  # fine - whatever files are present get loaded in numeric order.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   tablename: year2020_regionA_detail

  # With only the group_a pattern explicit, auto_detect: true will still
  # bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
  # standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
  # for the fixture that exercises exactly this layout.