# foxtrot/generic_loader/sample_folder_config.yaml

# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects. Typical invocations:
#
#   python load_folder.py --config sample_folder_config.yaml --dry-run
#   python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if the resolved path does
# not exist.

# Directory containing the files to load (a relative path here is resolved
# as described above).
folder: samples/folder_test
# NOTE(review): presumably the PostgreSQL schema the cluster tables are
# created in - confirm against load_folder.py.
schemaname: public

# Applied when creating the first file of each cluster.
# One of: fail | replace | append. Default: fail.
# Individual clusters may override this with their own `if_exists` key.
if_exists: replace

# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
#
# Example: group_b1.xpt and group_b2.xpt are grouped into a "group_b"
# cluster, while standalone.xpt becomes a "standalone" cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive, so uncomment
# at most one of the two examples below. A cluster can override these via
# its own `include` / `exclude` keys.
#
# include:
#   - ID
#   - INTCOL
# exclude:
#   - ALLNULL

# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires if_exists: replace or fail; append is not supported for the
# initial table creation.
# Single field:
#   partition_by: state
# Multiple fields (cascading):
#   partition_by:
#     - state
#     - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
#   max_partitions: 10000

# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
#   indexes: state
# Multiple columns (one index per column):
#   indexes:
#     - state
#     - zip

# Folder-level column_types: Explicit {column_name: postgres_type} map that
# bypasses automatic type inference for the listed columns. Applied to
# every cluster unless a cluster supplies its own column_types, which are
# merged on top (cluster entries win on conflict).
#
# During --workers>1 runs the pre-scan derives a cluster-wide "auto-union"
# type per column (e.g. any file stores the column as CHAR -> TEXT; all
# NUM with any format hinting decimals -> DOUBLE PRECISION; otherwise
# BIGINT). Entries in column_types here win over that auto-union - use
# them when the auto result is wrong or when --no-prescan disables the
# auto-union and you still need to pin a column.
#
# Valid type strings are anything the CREATE TABLE DDL accepts (TEXT,
# INTEGER, BIGINT, DOUBLE PRECISION, DATE, TIMESTAMP, ...). Columns that
# don't exist in a given file are simply ignored for that file.
#
# column_types:
#   RESP_PH_PREFIX_ID: TEXT
#   RESP_PH_SUFFIX_ID: TEXT
#   SOMELONG_ID: BIGINT

# Folder-level all_nullable: If true, every column of every cluster is
# stamped nullable in the generated schema; NOT NULL inference is skipped
# entirely. Use this when the sampler wrongly concludes a column has no
# nulls (sampled rows happened to be dense, but later files in the cluster
# carry nulls) and COPY blows up mid-load. Inherited by all clusters
# unless a cluster supplies its own all_nullable. The CLI flag
# --all-nullable overrides both this and any per-cluster setting when
# passed. Off by default.
#
# all_nullable: false

# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, `exclude`,
# `column_types`, `all_nullable`, `partition_by`, `max_partitions`, and
# `indexes` are optional per-cluster overrides of the folder-level
# defaults above. Cluster-level column_types entries win over folder-
# level entries for the same column.
clusters:
  # Matches basenames of the form group_a<digits>.xpt. The regex is
  # single-quoted so YAML leaves the backslashes untouched.
  - pattern: '^group_a\d+\.xpt$'
    tablename: group_a
    # column_types:
    #   INTCOL: TEXT
    # all_nullable: true  # per-cluster override of the folder-level default

  # Example of an explicit override. Uncomment to force the group_b cluster to
  # append instead of replace even though the folder default is "replace":
  #
  # - pattern: '^group_b\d+\.xpt$'
  #   tablename: group_b
  #   if_exists: append

  # Per-cluster partition_by / max_partitions override. These take precedence
  # over the folder-level defaults above.
  #
  # - pattern: '^group_c\d+\.xpt$'
  #   tablename: group_c
  #   partition_by:
  #     - region
  #     - year
  #   max_partitions: 500

  # Per-cluster indexes override. Takes precedence over the folder-level
  # indexes default above. An explicit empty list (`indexes: []`) disables
  # indexing for this cluster even when the folder default has indexes.
  #
  # - pattern: '^group_d\d+\.xpt$'
  #   tablename: group_d
  #   indexes:
  #     - region
  #     - year

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each file becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # files within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-
  # padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
  # fine - whatever files are present get loaded in numeric order.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   tablename: year2020_regionA_detail

  # With only the group_a pattern explicit, auto_detect: true will still
  # bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
  # standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
  # for the fixture that exercises exactly this layout.