# foxtrot/utils/sample_s3_download_config.yaml
# 2026-04-21 20:05:26 -05:00
#
# 112 lines
# 4.7 KiB
# YAML

# Example S3 download config for utils/s3_download.py.
#
# The shape mirrors what `s3_download.py` expects. Typical usage:
#
#   python s3_download.py --config sample_s3_download_config.yaml --dry-run
#   python s3_download.py --config sample_s3_download_config.yaml
#
# Relative paths (e.g. local_folder) are resolved against this config file's
# directory first, falling back to the current working directory if the
# config-relative path doesn't exist.
# ---------------------------------------------------------------------------
# Required: where to read from and write to
# ---------------------------------------------------------------------------
# Name of the S3 bucket to read objects from.
bucket: my-bucket
# Key prefix within the bucket under which objects are listed.
# Listing is recursive under this prefix - no S3 Delimiter is used - so a
# nested object like `census/2020/raw/nested/group_c1.sas7bdat` will be
# considered. Regex patterns below match against the object BASENAME only
# (the part after the last `/`), so subfolder location does not affect
# matching.
prefix: census/2020/raw/
# Root destination on disk. One subfolder per cluster is created beneath it.
# A relative path (as here) is resolved as described in the header at the
# top of this file.
# If two objects in the same cluster share a basename (possible under the
# recursive scan), the second one is renamed to a key-derived filename
# (slashes replaced with `__`) so neither file overwrites the other.
local_folder: ./downloads
# ---------------------------------------------------------------------------
# Optional: AWS credentials
# ---------------------------------------------------------------------------
# Named profile from ~/.aws/credentials. Omit to use the default boto3
# credential chain (env vars, instance role, SSO, etc.).
# aws_profile: default
# ---------------------------------------------------------------------------
# Optional: discovery behavior
# ---------------------------------------------------------------------------
# When true (default), any object not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the basename stem. Stems without trailing digits become their
# own singleton cluster.
#
# Example: `group_c1.sas7bdat` and `group_c2.sas7bdat` both reduce to the
# stem `group_c` and are therefore grouped together.
#
# Auto-detect only recognizes *trailing* digit runs. If your basenames put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# object becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true
# Object extensions to consider. Anything else under the prefix is ignored.
# Default (when this key is omitted): .sas7bdat, .xpt, .xport, .txt, .csv, .tsv
# extensions:
#   - .sas7bdat
#   - .xpt
#   - .xport
#   - .txt
#   - .csv
#   - .tsv
# ---------------------------------------------------------------------------
# Optional: download behavior
# ---------------------------------------------------------------------------
# What to do when the destination file already exists locally.
#   skip      - (default) if the local file's byte size matches the S3
#               object's Size, reuse it. If sizes differ, re-download with a
#               warning. Same byte-size cache rule used by
#               utils/file_viewer.py::_ensure_local_copy.
#   overwrite - always re-download.
#   error     - abort the run if any local file exists with a different size.
#               (Equal sizes still skip.)
on_exists: skip
# Parallel download workers. boto3 clients are thread-safe. Default: 4.
# concurrency: 4
# ---------------------------------------------------------------------------
# Explicit cluster patterns
# ---------------------------------------------------------------------------
#
# Each pattern is matched against the S3 object BASENAME (last path segment
# of the key). Objects matched by a pattern are pulled out of the
# auto-detect pool, so explicit and auto clusters compose cleanly.
#
# `name` becomes both the subfolder name under `local_folder` and the label
# used in the discovery / summary output. Names must be unique across
# explicit clusters.
# List of explicit clusters. Each entry is a mapping with two keys:
#   pattern - regex matched against the S3 object basename
#   name    - cluster label and subfolder name (unique across entries)
# `name` must be indented to line up with `pattern` so that it belongs to
# the same list item; at column 0 it would be parsed as a separate
# top-level key and the cluster entry would have no name.
clusters:
  - pattern: '^group_a\d+\.sas7bdat$'
    name: group_a

  # - pattern: '^group_b\d+\.sas7bdat$'
  #   name: group_b

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the basename stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each object becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # objects within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-padding.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   name: year2020_regionA_detail

  # Text file cluster example (when file_type: text):
  # NOTE(review): `file_type` is not described anywhere else in this sample
  # config - confirm it is a key that s3_download.py actually reads.
  # - pattern: '^data_group_a\d+\.txt$'
  #   name: data_group_a