# Example S3 download config for utils/s3_download.py.
#
# Shape mirrors what `s3_download.py` expects. Usage:
#
#   python s3_download.py --config sample_s3_download_config.yaml --dry-run
#   python s3_download.py --config sample_s3_download_config.yaml
#
# Relative paths (e.g. local_folder) are resolved against this config file's
# directory first, falling back to the current working directory if that
# doesn't exist.

# ---------------------------------------------------------------------------
# Required: where to read from and write to
# ---------------------------------------------------------------------------

bucket: my-bucket

# Listing is recursive under this prefix - no S3 Delimiter is used. A nested
# object like `census/2020/raw/nested/group_c1.sas7bdat` will be considered.
# Regex patterns below match against the object BASENAME only (the part
# after the last `/`), so subfolder location does not affect matching.
prefix: census/2020/raw/

# Root destination on disk. One subfolder per cluster is created beneath it.
# If two objects in the same cluster share a basename (possible under the
# recursive scan), the second one is renamed to a key-derived filename
# (slashes replaced with `__`) so neither file overwrites the other.
local_folder: ./downloads

# ---------------------------------------------------------------------------
# Optional: AWS credentials
# ---------------------------------------------------------------------------

# Named profile from ~/.aws/credentials. Omit to use the default boto3
# credential chain (env vars, instance role, SSO, etc.).
# aws_profile: default

# ---------------------------------------------------------------------------
# Optional: discovery behavior
# ---------------------------------------------------------------------------

# When true (default), any object not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the basename stem. Stems without trailing digits become their
# own singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your basenames put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# object becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Object extensions to consider. Anything else under the prefix is ignored.
# Default (when this key is omitted): .sas7bdat, .xpt, .xport, .txt, .csv, .tsv
# extensions:
#   - .sas7bdat
#   - .xpt
#   - .txt
#   - .csv
#   - .tsv

# ---------------------------------------------------------------------------
# Optional: download behavior
# ---------------------------------------------------------------------------

# What to do when the destination file already exists locally.
#   skip      - (default) if the local file's byte size matches the S3
#               object's Size, reuse it. If sizes differ, re-download with a
#               warning. Same byte-size cache rule used by
#               utils/file_viewer.py::_ensure_local_copy.
#   overwrite - always re-download.
#   error     - abort the run if any local file exists with a different size.
#               (Equal sizes still skip.)
on_exists: skip

# Parallel download workers. boto3 clients are thread-safe. Default: 4.
# concurrency: 4

# ---------------------------------------------------------------------------
# Explicit cluster patterns
# ---------------------------------------------------------------------------
#
# Each pattern is matched against the S3 object BASENAME (last path segment
# of the key). Objects matched by a pattern are pulled out of the
# auto-detect pool, so explicit and auto clusters compose cleanly.
#
# `name` becomes both the subfolder name under `local_folder` and the label
# used in the discovery / summary output. Names must be unique across
# explicit clusters.
#
# Each list item is one mapping holding both `pattern` and `name`; `name`
# must be indented under its `- pattern:` entry so it belongs to that item.
# Patterns are single-quoted so backslashes in the regex stay literal.
clusters:
  - pattern: '^group_a\d+\.sas7bdat$'
    name: group_a

  # - pattern: '^group_b\d+\.sas7bdat$'
  #   name: group_b

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the basename stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each object becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # objects within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-padding.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   name: year2020_regionA_detail

  # Text file cluster example (when file_type: text):
  # - pattern: '^data_group_a\d+\.txt$'
  #   name: data_group_a