# foxtrot/utils/sample_s3_download_config.yaml

# Example S3 download config for utils/s3_download.py.
#
# The keys below mirror the shape that `s3_download.py` expects. Usage:
#
#   python s3_download.py --config sample_s3_download_config.yaml --dry-run
#   python s3_download.py --config sample_s3_download_config.yaml
#
# Relative paths (e.g. local_folder) are resolved against this config file's
# directory first, falling back to the current working directory if that
# doesn't exist.

# ---------------------------------------------------------------------------
# Required: where to read from and write to
# ---------------------------------------------------------------------------

# S3 bucket to list and download from. `my-bucket` is a placeholder -
# replace it with your own bucket.
bucket: my-bucket

# Key prefix to scan. Listing is recursive under this prefix - no S3
# Delimiter is used - so a nested object like
# `census/2020/raw/nested/group_c1.sas7bdat` is included in the scan.
# Regex patterns below match against the object BASENAME only (the part
# after the last `/`), so subfolder location does not affect matching.
prefix: census/2020/raw/

# Root destination on disk. One subfolder per cluster is created beneath it.
# The sample value is a relative path, so it is resolved using the
# relative-path rule described in the header comment of this file.
# If two objects in the same cluster share a basename (possible under the
# recursive scan), the second one is renamed to a key-derived filename
# (slashes replaced with `__`) so neither file overwrites the other.
local_folder: ./downloads

# ---------------------------------------------------------------------------
# Optional: AWS credentials
# ---------------------------------------------------------------------------

# Named profile from ~/.aws/credentials. Omit to use the default boto3
# credential chain (env vars, instance role, SSO, etc.).
# aws_profile: default

# ---------------------------------------------------------------------------
# Optional: discovery behavior
# ---------------------------------------------------------------------------

# When true (default), any object not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the basename stem. For example, the stems `group_c1` and
# `group_c2` both reduce to `group_c`, so those objects are grouped together.
# Stems without trailing digits become their own singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your basenames put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# object becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

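# Object extensions to consider. Anything else under the prefix is ignored.
# Default (when this key is omitted): .sas7bdat, .xpt, .xport, .txt, .csv, .tsv
# Note: the example list below leaves out .xport, so uncommenting it as-is
# selects a narrower set than the default.
# extensions:
#   - .sas7bdat
#   - .xpt
#   - .txt
#   - .csv
#   - .tsv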

# ---------------------------------------------------------------------------
# Optional: download behavior
# ---------------------------------------------------------------------------

# What to do when the destination file already exists locally.
#   skip      - (default) if the local file's byte size matches the S3
#               object's Size, reuse it. If sizes differ, re-download with a
#               warning. Same byte-size cache rule used by
#               utils/file_viewer.py::_ensure_local_copy.
#   overwrite - always re-download.
#   error     - abort the run if any local file exists with a different size.
#               (Equal sizes still skip.)
#
# NOTE: as described above, `skip` and `error` compare byte sizes only, so an
# object whose content changed but whose size did not would be treated as
# up to date. Use `overwrite` when you need to force a fresh copy.
on_exists: skip

# Parallel download workers. boto3 clients are thread-safe. Default: 4.
# concurrency: 4

# ---------------------------------------------------------------------------
# Explicit cluster patterns
# ---------------------------------------------------------------------------
#
# Each pattern is matched against the S3 object BASENAME (last path segment
# of the key). Objects matched by a pattern are pulled out of the
# auto-detect pool, so explicit and auto clusters compose cleanly.
#
# `name` becomes both the subfolder name under `local_folder` and the label
# used in the discovery / summary output. Names must be unique across
# explicit clusters.
#
# Keep patterns in single quotes: YAML does no backslash processing inside
# single-quoted scalars, so `\d` and `\.` are passed through unchanged.
clusters:
  - pattern: '^group_a\d+\.sas7bdat$'
    name: group_a

  # - pattern: '^group_b\d+\.sas7bdat$'
  #   name: group_b

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the basename stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each object becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # objects within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-padding.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   name: year2020_regionA_detail

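  # Text file cluster example (when file_type: text).
  # NOTE(review): `file_type` is not described anywhere else in this sample -
  # confirm the key is still supported, and document it above if so.
  # - pattern: '^data_group_a\d+\.txt$'
  #   name: data_group_a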