# Example folder-level loader config.
#
# Shape mirrors what `load_folder.py` expects:
#
#   python load_folder.py --config sample_folder_config.yaml --dry-run
#   python load_folder.py --config sample_folder_config.yaml
#
# Relative paths are resolved against this config file's directory first,
# falling back to the current working directory if that doesn't exist.

folder: samples/folder_test
schemaname: public

# Applied when creating the first file of each cluster.
# One of: fail | replace | append. Default: fail.
if_exists: replace

# When true (default), any file not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Folder-level column filter. Every file in every cluster passes through
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
# override these via its own `include` / `exclude` keys.
#
# include:
#   - ID
#   - INTCOL
# exclude:
#   - ALLNULL

# Folder-level partition_by: Partition every cluster's table by unique values
# of these columns. Inherited by all clusters unless overridden per-cluster.
# Requires if_exists: replace or fail (not append for initial creation).
# Single field:
# partition_by: state
# Multiple fields (cascading):
# partition_by:
#   - state
#   - zip
#
# Folder-level max_partitions: Warning threshold for total partition count
# (default: 10000). Inherited by all clusters unless overridden per-cluster.
# max_partitions: 10000

# Folder-level indexes: Create B-tree indexes on these columns after data
# loading. Inherited by all clusters unless overridden per-cluster.
# Indexes are created with IF NOT EXISTS for safe use with append mode.
# Single column:
# indexes: state
# Multiple columns (one index per column):
# indexes:
#   - state
#   - zip

# Folder-level column_types: Explicit {column_name: postgres_type} map that
# bypasses automatic type inference for the listed columns. Applied to
# every cluster unless a cluster supplies its own column_types, which are
# merged on top (cluster entries win on conflict).
#
# During --workers>1 runs the pre-scan derives a cluster-wide "auto-union"
# type per column (e.g. any file stores the column as CHAR -> TEXT; all
# NUM with any format hinting decimals -> DOUBLE PRECISION; otherwise
# BIGINT). Entries in column_types here win over that auto-union - use
# them when the auto result is wrong or when --no-prescan disables the
# auto-union and you still need to pin a column.
#
# Valid type strings are anything the CREATE TABLE DDL accepts (TEXT,
# INTEGER, BIGINT, DOUBLE PRECISION, DATE, TIMESTAMP, ...). Columns that
# don't exist in a given file are simply ignored for that file.
#
# column_types:
#   RESP_PH_PREFIX_ID: TEXT
#   RESP_PH_SUFFIX_ID: TEXT
#   SOMELONG_ID: BIGINT

# Explicit cluster patterns. Each pattern is matched against the file
# *basename*. Files matched by a pattern are pulled out of the auto-detect
# pool, so explicit and auto clusters compose cleanly.
#
# `tablename` is required. `if_exists`, `include`, `exclude`, and
# `column_types` are optional per-cluster overrides of the folder-level
# defaults above. Cluster-level column_types entries win over folder-
# level entries for the same column.
clusters:
  - pattern: '^group_a\d+\.xpt$'
    tablename: group_a
    # column_types:
    #   INTCOL: TEXT

# Example of an explicit override. Uncomment to force the group_b cluster to
# append instead of replace even though the folder default is "replace":
#
#   - pattern: '^group_b\d+\.xpt$'
#     tablename: group_b
#     if_exists: append

# Per-cluster partition_by / max_partitions override. These take precedence
# over the folder-level defaults above.
#
#   - pattern: '^group_c\d+\.xpt$'
#     tablename: group_c
#     partition_by:
#       - region
#       - year
#     max_partitions: 500

# Per-cluster indexes override. Takes precedence over the folder-level
# indexes default above. An explicit empty list disables indexing for
# this cluster even when the folder default has indexes.
#
#   - pattern: '^group_d\d+\.xpt$'
#     tablename: group_d
#     indexes:
#       - region
#       - year

# Embedded-digit example. When the varying number sits in the MIDDLE of
# the stem (e.g. year2020_regionA_40_detail.sas7bdat,
# year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
# them - each file becomes its own singleton cluster. An explicit
# pattern bucketizes them correctly. The \d+ matches any width, and
# files within the cluster are sorted numerically by the last digit
# group in the stem, so _9_ sorts before _40_ regardless of zero-
# padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
# fine - whatever files are present get loaded in numeric order.
#
#   - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
#     tablename: year2020_regionA_detail

# With only the group_a pattern explicit, auto_detect: true will still
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout.