# Example S3 download config for utils/s3_download.py.
#
# Shape mirrors what `s3_download.py` expects:
#
#   python s3_download.py --config sample_s3_download_config.yaml --dry-run
#   python s3_download.py --config sample_s3_download_config.yaml
#
# Relative paths (e.g. local_folder) are resolved against this config file's
# directory first, falling back to the current working directory if that
# doesn't exist.

# ---------------------------------------------------------------------------
# Required: where to read from and write to
# ---------------------------------------------------------------------------

bucket: my-bucket

# Listing is recursive under this prefix - no S3 Delimiter is used. A nested
# object like `census/2020/raw/nested/group_c1.sas7bdat` will be considered.
# Regex patterns below match against the object BASENAME only (the part
# after the last `/`), so subfolder location does not affect matching.
prefix: census/2020/raw/

# Root destination on disk. One subfolder per cluster is created beneath it.
# If two objects in the same cluster share a basename (possible under the
# recursive scan), the second one is renamed to a key-derived filename
# (slashes replaced with `__`) so neither file overwrites the other.
local_folder: ./downloads

# ---------------------------------------------------------------------------
# Optional: AWS credentials
# ---------------------------------------------------------------------------

# Named profile from ~/.aws/credentials. Omit to use the default boto3
# credential chain (env vars, instance role, SSO, etc.).
# aws_profile: default

# ---------------------------------------------------------------------------
# Optional: discovery behavior
# ---------------------------------------------------------------------------

# When true (default), any object not matched by an explicit pattern below is
# auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the basename stem. Stems without trailing digits become their
# own singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your basenames put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# object becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true

# Object extensions to consider. Anything else under the prefix is ignored.
# Default (when this key is omitted): .sas7bdat, .xpt, .xport, .txt, .csv, .tsv
# extensions:
#   - .sas7bdat
#   - .xpt
#   - .txt
#   - .csv
#   - .tsv

# ---------------------------------------------------------------------------
# Optional: download behavior
# ---------------------------------------------------------------------------

# What to do when the destination file already exists locally.
#   skip      - (default) if the local file's byte size matches the S3
#               object's Size, reuse it. If sizes differ, re-download with a
#               warning. Same byte-size cache rule used by
#               utils/file_viewer.py::_ensure_local_copy.
#   overwrite - always re-download.
#   error     - abort the run if any local file exists with a different size.
#               (Equal sizes still skip.)
on_exists: skip

# Parallel download workers. boto3 clients are thread-safe. Default: 4.
# concurrency: 4

# ---------------------------------------------------------------------------
# Explicit cluster patterns
# ---------------------------------------------------------------------------
#
# Each pattern is matched against the S3 object BASENAME (last path segment
# of the key). Objects matched by a pattern are pulled out of the
# auto-detect pool, so explicit and auto clusters compose cleanly.
#
# `name` becomes both the subfolder name under `local_folder` and the label
# used in the discovery / summary output. Names must be unique across
# explicit clusters.

clusters:
  - pattern: '^group_a\d+\.sas7bdat$'
    name: group_a

  # - pattern: '^group_b\d+\.sas7bdat$'
  #   name: group_b

  # Embedded-digit example. When the varying number sits in the MIDDLE of
  # the basename stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each object becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # objects within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-padding.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   name: year2020_regionA_detail

  # Text file cluster example (when file_type: text):
  # - pattern: '^data_group_a\d+\.txt$'
  #   name: data_group_a