# foxtrot/generic_loader/sample_config.yaml
# Sample configuration for the generic loader.

# Data file to load. This sample points at a SAS transport (.xpt) file.
filename: samples/sample_kitchensink.xpt
# Schema and name of the target table.
schemaname: public
tablename: kitchensink
# include: Optional. If set, only these columns are loaded.
#   Mutually exclusive with exclude.
# include:
#   - ID
#   - INTCOL
#   - DATECOL
# exclude: Optional. Columns to drop. Mutually exclusive with include.
# exclude:
#   - ALLNULL
# if_exists: What to do if the target table already exists.
#   One of: fail | replace | append. Default: fail.
if_exists: append

# file_type: Type of data file to load. One of: sas | text. Default: sas.
#   sas  - SAS files (.sas7bdat, .xpt, .xport), read via pyreadstat
#   text - Delimited text files (.txt, .csv, .tsv), read via pandas
# file_type: sas
# delimiter: Column delimiter for text files. Only used when file_type: text.
# Accepts: "," (comma, default), "tab" or "\t" (tab), "pipe" or "|" (pipe),
# or any single character.
# delimiter: ","
# text_encoding: Character encoding for text files. Default: utf-8.
# Common alternatives: latin-1, cp1252, iso-8859-1.
# text_encoding: utf-8
# quotechar: Quote character for text files. Default: '"' (double quote).
# quotechar: '"'

# partition_by: Partition the table by unique values of these columns.
# Columns are applied in cascading order (first column = top-level partition).
# Creating the partitioned table requires if_exists: replace or fail;
# append cannot be used for initial creation.
# Single field:
# partition_by: state
# Multiple fields (cascading):
# partition_by:
#   - state
#   - zip
#
# max_partitions: Warning threshold for total partition count (default: 10000).
# If the number of partitions exceeds this, a warning is logged but loading continues.
# max_partitions: 10000

# indexes: Create B-tree indexes on these columns after data loading.
# Indexes are created with IF NOT EXISTS, so they are safe to use with
# append mode.
# Single column:
# indexes: state
# Multiple columns (one index per column):
# indexes:
#   - state
#   - zip
# column_types: Explicit {column_name: postgres_type} overrides that
# bypass automatic type inference for the listed columns. Useful when
# pyreadstat reports a column as NUM but you want it stored as TEXT
# (phone/ID columns that are conceptually strings), or when a column's
# inferred type is off for any other reason. Columns not listed here
# fall through to the normal inference path. Nullability is always
# computed from the data.
#
# column_types:
#   RESP_PH_PREFIX_ID: TEXT
#   SOMELONG_ID: BIGINT
# all_nullable: If true, every column is marked nullable in the generated
# schema and NOT NULL inference is skipped entirely. Use this when the
# sampler wrongly concludes that a column has no nulls (e.g. a dense sample
# followed by rare-null data downstream) and COPY then fails mid-load on the
# first null it encounters. Off by default. The CLI flag --all-nullable,
# when set, overrides this to true.
#
# all_nullable: false