Merge main into directory_explorer: combine text file support with exception tracking
This commit is contained in:
commit
f3bd5f02aa
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -16,6 +16,23 @@ tablename: kitchensink
|
|||||||
# Defaults to fail.
|
# Defaults to fail.
|
||||||
if_exists: append
|
if_exists: append
|
||||||
|
|
||||||
|
# file_type: Type of data file to load. One of: sas | text. Default: sas.
|
||||||
|
# sas - SAS files (.sas7bdat, .xpt, .xport) read via pyreadstat
|
||||||
|
# text - Delimited text files (.txt, .csv, .tsv) read via pandas
|
||||||
|
# file_type: sas
|
||||||
|
|
||||||
|
# delimiter: Column delimiter for text files. Only used when file_type: text.
|
||||||
|
# Accepts: "," (comma, default), "tab" or "\t" (tab), "pipe" or "|" (pipe),
|
||||||
|
# or any single character.
|
||||||
|
# delimiter: ","
|
||||||
|
|
||||||
|
# text_encoding: Character encoding for text files. Default: utf-8.
|
||||||
|
# Common alternatives: latin-1, cp1252, iso-8859-1.
|
||||||
|
# text_encoding: utf-8
|
||||||
|
|
||||||
|
# quotechar: Quote character for text files. Default: '"' (double quote).
|
||||||
|
# quotechar: '"'
|
||||||
|
|
||||||
# partition_by: Partition the table by unique values of these columns.
|
# partition_by: Partition the table by unique values of these columns.
|
||||||
# Columns are applied in cascading order (first column = top-level partition).
|
# Columns are applied in cascading order (first column = top-level partition).
|
||||||
# Requires if_exists: replace or fail (not append for initial creation).
|
# Requires if_exists: replace or fail (not append for initial creation).
|
||||||
@ -38,3 +55,24 @@ if_exists: append
|
|||||||
# indexes:
|
# indexes:
|
||||||
# - state
|
# - state
|
||||||
# - zip
|
# - zip
|
||||||
|
|
||||||
|
# column_types: Explicit {column_name: postgres_type} overrides that
|
||||||
|
# bypass automatic type inference for the listed columns. Useful when
|
||||||
|
# pyreadstat reports a column as NUM but you want it stored as TEXT
|
||||||
|
# (phone/ID columns that are conceptually strings), or when a column's
|
||||||
|
# inferred type is off for any other reason. Columns not listed here
|
||||||
|
# fall through to the normal inference path. Nullability is always
|
||||||
|
# computed from the data.
|
||||||
|
#
|
||||||
|
# column_types:
|
||||||
|
# RESP_PH_PREFIX_ID: TEXT
|
||||||
|
# SOMELONG_ID: BIGINT
|
||||||
|
|
||||||
|
# all_nullable: If true, every column is stamped nullable in the generated
|
||||||
|
# schema; NOT NULL inference is skipped entirely. Use this when the sampler
|
||||||
|
# wrongly concludes a column has no nulls (e.g. a dense sample followed by
|
||||||
|
# rare-null data downstream) and COPY blows up mid-load on the first null
|
||||||
|
# it hits. Off by default. The CLI flag --all-nullable overrides this to
|
||||||
|
# true when set.
|
||||||
|
#
|
||||||
|
# all_nullable: false
|
||||||
|
|||||||
@ -27,6 +27,25 @@ if_exists: replace
|
|||||||
# see the embedded-digit example near the bottom of this file.
|
# see the embedded-digit example near the bottom of this file.
|
||||||
auto_detect: true
|
auto_detect: true
|
||||||
|
|
||||||
|
# file_type: Type of data files in this folder. One of: sas | text. Default: sas.
|
||||||
|
# sas - SAS files (.sas7bdat, .xpt, .xport) read via pyreadstat
|
||||||
|
# text - Delimited text files (.txt, .csv, .tsv) read via pandas
|
||||||
|
# When set to 'text', the folder scanner looks for .txt/.csv/.tsv files
|
||||||
|
# instead of .sas7bdat/.xpt/.xport files.
|
||||||
|
# file_type: sas
|
||||||
|
|
||||||
|
# delimiter: Column delimiter for text files. Only used when file_type: text.
|
||||||
|
# Accepts: "," (comma, default), "tab" or "\t" (tab), "pipe" or "|" (pipe),
|
||||||
|
# or any single character.
|
||||||
|
# delimiter: ","
|
||||||
|
|
||||||
|
# text_encoding: Character encoding for text files. Default: utf-8.
|
||||||
|
# Common alternatives: latin-1, cp1252, iso-8859-1.
|
||||||
|
# text_encoding: utf-8
|
||||||
|
|
||||||
|
# quotechar: Quote character for text files. Default: '"' (double quote).
|
||||||
|
# quotechar: '"'
|
||||||
|
|
||||||
# Folder-level column filter. Every file in every cluster passes through
|
# Folder-level column filter. Every file in every cluster passes through
|
||||||
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
|
# this filter. `include` and `exclude` are mutually exclusive. A cluster can
|
||||||
# override these via its own `include` / `exclude` keys.
|
# override these via its own `include` / `exclude` keys.
|
||||||
@ -61,15 +80,52 @@ auto_detect: true
|
|||||||
# - state
|
# - state
|
||||||
# - zip
|
# - zip
|
||||||
|
|
||||||
|
# Folder-level column_types: Explicit {column_name: postgres_type} map that
|
||||||
|
# bypasses automatic type inference for the listed columns. Applied to
|
||||||
|
# every cluster unless a cluster supplies its own column_types, which are
|
||||||
|
# merged on top (cluster entries win on conflict).
|
||||||
|
#
|
||||||
|
# During --workers>1 runs the pre-scan derives a cluster-wide "auto-union"
|
||||||
|
# type per column (e.g. any file stores the column as CHAR -> TEXT; all
|
||||||
|
# NUM with any format hinting decimals -> DOUBLE PRECISION; otherwise
|
||||||
|
# BIGINT). Entries in column_types here win over that auto-union - use
|
||||||
|
# them when the auto result is wrong or when --no-prescan disables the
|
||||||
|
# auto-union and you still need to pin a column.
|
||||||
|
#
|
||||||
|
# Valid type strings are anything the CREATE TABLE DDL accepts (TEXT,
|
||||||
|
# INTEGER, BIGINT, DOUBLE PRECISION, DATE, TIMESTAMP, ...). Columns that
|
||||||
|
# don't exist in a given file are simply ignored for that file.
|
||||||
|
#
|
||||||
|
# column_types:
|
||||||
|
# RESP_PH_PREFIX_ID: TEXT
|
||||||
|
# RESP_PH_SUFFIX_ID: TEXT
|
||||||
|
# SOMELONG_ID: BIGINT
|
||||||
|
|
||||||
|
# Folder-level all_nullable: If true, every column of every cluster is
|
||||||
|
# stamped nullable in the generated schema; NOT NULL inference is skipped
|
||||||
|
# entirely. Use this when the sampler wrongly concludes a column has no
|
||||||
|
# nulls (sampled rows happened to be dense, but later files in the cluster
|
||||||
|
# carry nulls) and COPY blows up mid-load. Inherited by all clusters
|
||||||
|
# unless a cluster supplies its own all_nullable. The CLI flag
|
||||||
|
# --all-nullable overrides both this and any per-cluster setting when
|
||||||
|
# passed. Off by default.
|
||||||
|
#
|
||||||
|
# all_nullable: false
|
||||||
|
|
||||||
# Explicit cluster patterns. Each pattern is matched against the file
|
# Explicit cluster patterns. Each pattern is matched against the file
|
||||||
# *basename*. Files matched by a pattern are pulled out of the auto-detect
|
# *basename*. Files matched by a pattern are pulled out of the auto-detect
|
||||||
# pool, so explicit and auto clusters compose cleanly.
|
# pool, so explicit and auto clusters compose cleanly.
|
||||||
#
|
#
|
||||||
# `tablename` is required. `if_exists`, `include`, and `exclude` are
|
# `tablename` is required. `if_exists`, `include`, `exclude`, and
|
||||||
# optional per-cluster overrides of the folder-level defaults above.
|
# `column_types` (plus `all_nullable`) are optional per-cluster overrides of the folder-level
|
||||||
|
# defaults above. Cluster-level column_types entries win over folder-
|
||||||
|
# level entries for the same column.
|
||||||
clusters:
|
clusters:
|
||||||
- pattern: '^group_a\d+\.xpt$'
|
- pattern: '^group_a\d+\.xpt$'
|
||||||
tablename: group_a
|
tablename: group_a
|
||||||
|
# column_types:
|
||||||
|
# INTCOL: TEXT
|
||||||
|
# all_nullable: true # per-cluster override of the folder-level default
|
||||||
|
|
||||||
# Example of an explicit override. Uncomment to force the group_b cluster to
|
# Example of an explicit override. Uncomment to force the group_b cluster to
|
||||||
# append instead of replace even though the folder default is "replace":
|
# append instead of replace even though the folder default is "replace":
|
||||||
@ -111,6 +167,10 @@ clusters:
|
|||||||
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
|
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
|
||||||
# tablename: year2020_regionA_detail
|
# tablename: year2020_regionA_detail
|
||||||
|
|
||||||
|
# Text file cluster example (when file_type: text):
|
||||||
|
# - pattern: '^data_group_a\d+\.txt$'
|
||||||
|
# tablename: data_group_a
|
||||||
|
|
||||||
# With only the group_a pattern explicit, auto_detect: true will still
|
# With only the group_a pattern explicit, auto_detect: true will still
|
||||||
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
|
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
|
||||||
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
|
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
|
||||||
|
|||||||
@ -1,7 +1,10 @@
|
|||||||
pandas>=2.0,<3.0
|
pandas>=2.0,<3.0
|
||||||
pyreadstat>=1.2,<2.0
|
pyreadstat>=1.2,<2.0
|
||||||
numpy>=2.1,<3.0
|
numpy>=2.1,<3.0
|
||||||
|
pyarrow>=22.0,<24.0
|
||||||
pyyaml>=6.0,<7.0
|
pyyaml>=6.0,<7.0
|
||||||
psycopg2-binary>=2.9,<3.0
|
psycopg2-binary>=2.9,<3.0
|
||||||
python-dotenv>=1.0,<2.0
|
python-dotenv>=1.0,<2.0
|
||||||
boto3>=1.28,<2.0
|
boto3>=1.28,<2.0
|
||||||
|
openpyxl>=3.1,<4.0
|
||||||
|
tqdm>=4.66,<5.0
|
||||||
|
|||||||
@ -1,30 +1,39 @@
|
|||||||
"""Explore S3 directories and categorise them by accessibility.
|
"""Explore S3 directories and categorise them by accessibility.
|
||||||
|
|
||||||
Reads a text file containing one S3 prefix per line (paths within the bucket
|
Reads a text file containing one S3 prefix per line (paths within the bucket
|
||||||
configured by the ``S3_BUCKET`` constant), then for each prefix:
|
configured by the ``S3_BUCKET`` constant or ``--bucket`` CLI argument), then
|
||||||
|
for each prefix:
|
||||||
|
|
||||||
- Lists all objects recursively (via ``list_objects_v2`` paginator)
|
- Lists all objects recursively (via ``list_objects_v2`` paginator)
|
||||||
- **Only considers files matching the ``FILE_EXTENSION`` filter** (default
|
- **Only considers files matching the configured extensions** (default: all
|
||||||
``.sas7bdat``). All other file types are ignored.
|
supported extensions — SAS and text). All other file types are ignored.
|
||||||
- Tests read permission with ``head_object`` on the first matching file found
|
- Tests read permission with ``head_object`` on the first matching file found
|
||||||
- If the first file is accessible, tests ALL remaining files individually
|
- If the first file is accessible, tests ALL remaining files individually
|
||||||
- Categorises the directory as **Available**, **Blocked**, **Empty**, and
|
- Categorises the directory as **Available**, **Blocked**, **Empty**, and
|
||||||
tracks individual file **Exceptions** within available directories
|
tracks individual file **Exceptions** within available directories
|
||||||
|
|
||||||
|
Supported file types
|
||||||
|
--------------------
|
||||||
|
* **SAS files**: ``.sas7bdat``, ``.xpt``, ``.xport``
|
||||||
|
* **Text / delimited files**: ``.txt``, ``.csv``, ``.tsv``
|
||||||
|
|
||||||
A directory is considered *empty* if it contains no files matching the
|
A directory is considered *empty* if it contains no files matching the
|
||||||
extension filter, even when other file types are present.
|
extension filter, even when other file types are present.
|
||||||
|
|
||||||
Configure the constants below, then run::
|
Configure the constants below (or use CLI arguments), then run::
|
||||||
|
|
||||||
python3 data_explorer.py
|
python3 data_explorer.py [OPTIONS]
|
||||||
|
|
||||||
Python 3.10+ compatible. Requires only ``boto3`` / ``botocore`` and stdlib.
|
Python 3.10+ compatible. Requires ``boto3`` / ``botocore`` and stdlib.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Tuple
|
from typing import List, Set, Tuple
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Dependency check
|
# Dependency check
|
||||||
@ -43,11 +52,25 @@ except ImportError:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Configuration — edit these before running
|
# Extension constants
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
FILE_EXTENSION: str = ".sas7bdat"
|
SAS_EXTENSIONS: Set[str] = {".sas7bdat", ".xpt", ".xport"}
|
||||||
"""Only files whose key ends with this extension (case-insensitive) are considered."""
|
"""File extensions recognised as SAS data files."""
|
||||||
|
|
||||||
|
TEXT_EXTENSIONS: Set[str] = {".txt", ".csv", ".tsv"}
|
||||||
|
"""File extensions recognised as delimited text / CSV files."""
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS: Set[str] = SAS_EXTENSIONS | TEXT_EXTENSIONS
|
||||||
|
"""Union of all file extensions this tool can work with."""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration defaults — edit these or override via CLI arguments
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FILE_EXTENSIONS: Set[str] = SUPPORTED_EXTENSIONS
|
||||||
|
"""Set of extensions to filter on (case-insensitive). Defaults to all supported."""
|
||||||
|
|
||||||
INPUT_FILE: str = "s3_directories.txt"
|
INPUT_FILE: str = "s3_directories.txt"
|
||||||
"""Path to the text file containing one S3 prefix per line."""
|
"""Path to the text file containing one S3 prefix per line."""
|
||||||
@ -58,6 +81,57 @@ S3_BUCKET: str = "my-bucket"
|
|||||||
AWS_PROFILE: str = "default"
|
AWS_PROFILE: str = "default"
|
||||||
"""AWS CLI profile name used for authentication."""
|
"""AWS CLI profile name used for authentication."""
|
||||||
|
|
||||||
|
# Text-file reading defaults (used when downloading / previewing text files)
|
||||||
|
DEFAULT_DELIMITER: str = ","
|
||||||
|
DEFAULT_ENCODING: str = "utf-8"
|
||||||
|
DEFAULT_QUOTECHAR: str = '"'
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Auto-detection helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def detect_file_type(filename: str) -> str:
    """Classify *filename* as ``'sas'``, ``'text'``, or ``'unknown'``.

    The decision is made purely from the final extension of *filename*
    (as split off by ``os.path.splitext``), compared case-insensitively
    against ``SAS_EXTENSIONS`` and ``TEXT_EXTENSIONS``.  For ``.tsv`` files
    the caller should default the delimiter to a tab character (``'\\t'``).

    Examples
    --------
    >>> detect_file_type("data.sas7bdat")
    'sas'
    >>> detect_file_type("report.CSV")
    'text'
    >>> detect_file_type("archive.zip")
    'unknown'
    """
    _, suffix = os.path.splitext(filename)
    suffix = suffix.lower()
    # SAS is checked first, mirroring the order of the extension constants.
    for label, known in (("sas", SAS_EXTENSIONS), ("text", TEXT_EXTENSIONS)):
        if suffix in known:
            return label
    return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def default_delimiter_for(filename: str) -> str:
    """Pick the default column delimiter implied by *filename*'s extension.

    * ``.tsv`` (any letter case) → ``'\\t'``
    * everything else → ``','``
    """
    suffix = os.path.splitext(filename)[1]
    return "\t" if suffix.lower() == ".tsv" else ","
|
||||||
|
|
||||||
|
|
||||||
|
def matches_extensions(key: str, extensions: Set[str]) -> bool:
    """Return ``True`` if *key* ends with any extension in *extensions*.

    The comparison is case-insensitive on both sides: *key* and every entry
    of *extensions* are lower-cased before matching, so ``{".CSV"}`` matches
    ``"report.csv"`` and vice versa.  An empty *extensions* collection never
    matches.
    """
    # str.endswith accepts a tuple of suffixes, so one call covers them all.
    suffixes = tuple(ext.lower() for ext in extensions)
    return key.lower().endswith(suffixes)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Data structures
|
# Data structures
|
||||||
@ -149,27 +223,36 @@ def format_size(size_bytes: int) -> str:
|
|||||||
return f"{size_bytes:,.1f} TB"
|
return f"{size_bytes:,.1f} TB"
|
||||||
|
|
||||||
|
|
||||||
|
def extensions_label(extensions: Set[str]) -> str:
    """Join *extensions* into a slash-separated label in sorted order.

    For example ``{".txt", ".csv", ".tsv"}`` becomes ``.csv/.tsv/.txt``.
    An empty set yields an empty string.
    """
    ordered = list(extensions)
    ordered.sort()
    return "/".join(ordered)
|
||||||
|
|
||||||
|
|
||||||
def list_objects(
|
def list_objects(
|
||||||
s3_client: "botocore.client.S3",
|
s3_client: "botocore.client.S3",
|
||||||
bucket: str,
|
bucket: str,
|
||||||
prefix: str,
|
prefix: str,
|
||||||
|
extensions: Set[str] | None = None,
|
||||||
) -> Tuple[List[Tuple[str, int]], int]:
|
) -> Tuple[List[Tuple[str, int]], int]:
|
||||||
"""Recursively list all objects under *prefix*.
|
"""Recursively list all objects under *prefix*.
|
||||||
|
|
||||||
Only objects whose key ends with ``FILE_EXTENSION`` (case-insensitive) are
|
Only objects whose key ends with one of *extensions* (case-insensitive) are
|
||||||
counted. All other files are silently skipped.
|
counted. All other files are silently skipped. When *extensions* is
|
||||||
|
``None`` the module-level ``FILE_EXTENSIONS`` set is used.
|
||||||
|
|
||||||
Returns ``(files, total_size)`` where *files* is a list of
|
Returns ``(files, total_size)`` where *files* is a list of
|
||||||
``(key, size)`` tuples for every matching object and *total_size* is the
|
``(key, size)`` tuples for every matching object and *total_size* is the
|
||||||
sum of their sizes in bytes.
|
sum of their sizes in bytes.
|
||||||
"""
|
"""
|
||||||
ext_lower = FILE_EXTENSION.lower()
|
if extensions is None:
|
||||||
|
extensions = FILE_EXTENSIONS
|
||||||
|
exts_lower = {e.lower() for e in extensions}
|
||||||
paginator = s3_client.get_paginator("list_objects_v2")
|
paginator = s3_client.get_paginator("list_objects_v2")
|
||||||
files: List[Tuple[str, int]] = []
|
files: List[Tuple[str, int]] = []
|
||||||
total_size: int = 0
|
total_size: int = 0
|
||||||
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
|
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
|
||||||
for obj in page.get("Contents", []):
|
for obj in page.get("Contents", []):
|
||||||
if not obj["Key"].lower().endswith(ext_lower):
|
if not any(obj["Key"].lower().endswith(ext) for ext in exts_lower):
|
||||||
continue
|
continue
|
||||||
files.append((obj["Key"], obj["Size"]))
|
files.append((obj["Key"], obj["Size"]))
|
||||||
total_size += obj["Size"]
|
total_size += obj["Size"]
|
||||||
@ -196,8 +279,26 @@ def check_read_permission(
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def explore_directories(prefixes: List[str]) -> Results:
|
def explore_directories(
|
||||||
"""Explore every prefix in ``S3_BUCKET`` and return categorised *Results*."""
|
prefixes: List[str],
|
||||||
|
*,
|
||||||
|
extensions: Set[str] | None = None,
|
||||||
|
) -> Results:
|
||||||
|
"""Explore every prefix in ``S3_BUCKET`` and return categorised *Results*.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
prefixes:
|
||||||
|
List of S3 key prefixes to explore.
|
||||||
|
extensions:
|
||||||
|
Set of file extensions to filter on. Defaults to the module-level
|
||||||
|
``FILE_EXTENSIONS`` (which itself defaults to ``SUPPORTED_EXTENSIONS``).
|
||||||
|
"""
|
||||||
|
if extensions is None:
|
||||||
|
extensions = FILE_EXTENSIONS
|
||||||
|
exts_lower = {e.lower() for e in extensions}
|
||||||
|
ext_label = extensions_label(extensions)
|
||||||
|
|
||||||
session = boto3.Session(profile_name=AWS_PROFILE)
|
session = boto3.Session(profile_name=AWS_PROFILE)
|
||||||
s3 = session.client("s3")
|
s3 = session.client("s3")
|
||||||
|
|
||||||
@ -206,13 +307,13 @@ def explore_directories(prefixes: List[str]) -> Results:
|
|||||||
|
|
||||||
for idx, prefix in enumerate(prefixes, start=1):
|
for idx, prefix in enumerate(prefixes, start=1):
|
||||||
print(
|
print(
|
||||||
f"[{idx}/{total}] Checking {prefix} (filtering for {FILE_EXTENSION}) ...",
|
f"[{idx}/{total}] Checking {prefix} (filtering for {ext_label}) ...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Recursive listing ------------------------------------------------
|
# --- Recursive listing ------------------------------------------------
|
||||||
try:
|
try:
|
||||||
files, total_size = list_objects(s3, S3_BUCKET, prefix)
|
files, total_size = list_objects(s3, S3_BUCKET, prefix, extensions=extensions)
|
||||||
except botocore.exceptions.ClientError as exc:
|
except botocore.exceptions.ClientError as exc:
|
||||||
code = exc.response.get("Error", {}).get("Code", "Unknown")
|
code = exc.response.get("Error", {}).get("Code", "Unknown")
|
||||||
message = exc.response.get("Error", {}).get("Message", str(exc))
|
message = exc.response.get("Error", {}).get("Message", str(exc))
|
||||||
@ -234,12 +335,13 @@ def explore_directories(prefixes: List[str]) -> Results:
|
|||||||
|
|
||||||
# --- Permission check on first file -----------------------------------
|
# --- Permission check on first file -----------------------------------
|
||||||
# Prefer a real object over a zero-byte directory marker (key ending
|
# Prefer a real object over a zero-byte directory marker (key ending
|
||||||
# in "/") for the head_object test.
|
# in "/") for the head_object test. The selected key must also match
|
||||||
|
# the extension filter.
|
||||||
first_key, _ = files[0]
|
first_key, _ = files[0]
|
||||||
test_key = first_key
|
test_key = first_key
|
||||||
if first_key.endswith("/") and total_size > 0:
|
if first_key.endswith("/") and total_size > 0:
|
||||||
for key, size in files:
|
for key, size in files:
|
||||||
if not (key.endswith("/") and size == 0):
|
if not (key.endswith("/") and size == 0) and matches_extensions(key, exts_lower):
|
||||||
test_key = key
|
test_key = key
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -268,7 +370,7 @@ def explore_directories(prefixes: List[str]) -> Results:
|
|||||||
if remaining:
|
if remaining:
|
||||||
if len(remaining) > 10:
|
if len(remaining) > 10:
|
||||||
print(
|
print(
|
||||||
f" Verifying access to {file_count} {FILE_EXTENSION} files in {prefix} ...",
|
f" Verifying access to {file_count} {ext_label} files in {prefix} ...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -306,11 +408,25 @@ def explore_directories(prefixes: List[str]) -> Results:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def print_results(results: Results) -> None:
|
def print_results(results: Results, *, extensions: Set[str] | None = None) -> None:
|
||||||
"""Print a clean, human-readable summary to stdout."""
|
"""Print a clean, human-readable summary to stdout.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
results:
|
||||||
|
The exploration results to display.
|
||||||
|
extensions:
|
||||||
|
The set of extensions that were used for filtering. Used only for
|
||||||
|
labelling in the output. Defaults to ``FILE_EXTENSIONS``.
|
||||||
|
"""
|
||||||
|
if extensions is None:
|
||||||
|
extensions = FILE_EXTENSIONS
|
||||||
|
ext_label = extensions_label(extensions)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print("=== S3 Directory Explorer Results ===")
|
print("=== S3 Directory Explorer Results ===")
|
||||||
print(f"Bucket: {S3_BUCKET}")
|
print(f"Bucket: {S3_BUCKET}")
|
||||||
|
print(f"Extensions: {ext_label}")
|
||||||
|
|
||||||
# --- Available ---
|
# --- Available ---
|
||||||
print()
|
print()
|
||||||
@ -319,7 +435,7 @@ def print_results(results: Results) -> None:
|
|||||||
for d in results.available:
|
for d in results.available:
|
||||||
print(f" {d.prefix}")
|
print(f" {d.prefix}")
|
||||||
print(
|
print(
|
||||||
f" {FILE_EXTENSION} files: {d.accessible_count}/{d.total_count} accessible"
|
f" Matching files ({ext_label}): {d.accessible_count}/{d.total_count} accessible"
|
||||||
f" | Total Size: {format_size(d.accessible_size)}"
|
f" | Total Size: {format_size(d.accessible_size)}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -332,7 +448,7 @@ def print_results(results: Results) -> None:
|
|||||||
for d in results.blocked:
|
for d in results.blocked:
|
||||||
if d.file_count:
|
if d.file_count:
|
||||||
print(f" {d.prefix}")
|
print(f" {d.prefix}")
|
||||||
print(f" {FILE_EXTENSION} files found: {d.file_count} | Error: {d.error}")
|
print(f" Matching files ({ext_label}) found: {d.file_count} | Error: {d.error}")
|
||||||
else:
|
else:
|
||||||
print(f" {d.prefix}")
|
print(f" {d.prefix}")
|
||||||
print(f" Error: {d.error}")
|
print(f" Error: {d.error}")
|
||||||
@ -351,7 +467,7 @@ def print_results(results: Results) -> None:
|
|||||||
|
|
||||||
# --- Empty ---
|
# --- Empty ---
|
||||||
print()
|
print()
|
||||||
print(f"--- Empty / no {FILE_EXTENSION} files ({len(results.empty)}) ---")
|
print(f"--- Empty / no matching files ({len(results.empty)}) ---")
|
||||||
if results.empty:
|
if results.empty:
|
||||||
for d in results.empty:
|
for d in results.empty:
|
||||||
print(f" {d.prefix}")
|
print(f" {d.prefix}")
|
||||||
@ -361,20 +477,163 @@ def print_results(results: Results) -> None:
|
|||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI argument parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the explorer script.

    Three argument groups are registered, in this order:

    * *File-type selection* — ``--file-type`` (``sas`` / ``text`` / ``all``,
      default ``all``) and ``--extensions`` (one or more explicit extensions).
    * *Text-file parameters* — ``--delimiter`` (default ``None``),
      ``--encoding`` and ``--quotechar`` (defaulting to ``DEFAULT_ENCODING``
      and ``DEFAULT_QUOTECHAR``).
    * *S3 settings* — ``--bucket``, ``--profile`` and ``--input-file``, all
      defaulting to ``None``; their help text shows the module-level
      ``S3_BUCKET``, ``AWS_PROFILE`` and ``INPUT_FILE`` values.

    Returns
    -------
    argparse.ArgumentParser
        The configured parser; nothing is parsed here.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Explore S3 directories and categorise them by accessibility. "
            "Supports SAS files (.sas7bdat, .xpt, .xport) and delimited text "
            "files (.txt, .csv, .tsv)."
        ),
    )

    # --- File-type / extension selection ---
    file_type_help = (
        "Restrict the scan to a specific file type. "
        "'sas' = .sas7bdat/.xpt/.xport only; "
        "'text' = .txt/.csv/.tsv only; "
        "'all' = both (default)."
    )
    extensions_help = (
        "Explicit list of extensions to filter on (e.g. --extensions .csv .tsv). "
        "Overrides --file-type when provided."
    )
    selection = cli.add_argument_group("File-type selection")
    selection.add_argument(
        "--file-type",
        choices=["sas", "text", "all"],
        default="all",
        help=file_type_help,
    )
    selection.add_argument(
        "--extensions",
        nargs="+",
        metavar="EXT",
        help=extensions_help,
    )

    # --- Text-file reading parameters ---
    text_options = cli.add_argument_group(
        "Text-file parameters",
        description=(
            "Parameters used when reading delimited text files. These are "
            "stored for downstream consumers and do not affect the S3 scan "
            "itself."
        ),
    )
    delimiter_help = (
        "Field delimiter for text files (default: ',' for .csv/.txt, "
        "'\\t' for .tsv). Use 'tab' or '\\t' for a tab character."
    )
    text_options.add_argument("--delimiter", default=None, help=delimiter_help)
    text_options.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help=f"Character encoding for text files (default: {DEFAULT_ENCODING}).",
    )
    text_options.add_argument(
        "--quotechar",
        default=DEFAULT_QUOTECHAR,
        help=f"Quote character for text files (default: {DEFAULT_QUOTECHAR!r}).",
    )

    # --- S3 / general settings ---
    # These default to None so the caller can tell "not given" apart from an
    # explicit value and fall back to the module-level constants.
    s3_options = cli.add_argument_group("S3 settings")
    s3_options.add_argument(
        "--bucket",
        default=None,
        help=f"S3 bucket name (default: {S3_BUCKET}).",
    )
    s3_options.add_argument(
        "--profile",
        default=None,
        help=f"AWS CLI profile name (default: {AWS_PROFILE}).",
    )
    s3_options.add_argument(
        "--input-file",
        default=None,
        help=f"Path to the text file with S3 prefixes (default: {INPUT_FILE}).",
    )

    return cli
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_extensions(args: argparse.Namespace) -> Set[str]:
    """Determine the active extension set from parsed CLI *args*.

    If ``--extensions`` supplies at least one usable entry it takes
    precedence.  Each entry is stripped, lower-cased and given a leading dot
    when missing.  Blank or dot-only entries are ignored so they cannot
    become a ``"."`` filter that matches unrelated keys.

    Otherwise ``--file-type`` selects a predefined set: ``sas``, ``text``,
    or (for any other value) every supported extension.

    Returns
    -------
    Set[str]
        A new set on every call, so callers may mutate the result without
        altering the module-level extension constants.
    """
    if args.extensions:
        chosen: Set[str] = set()
        for raw in args.extensions:
            ext = raw.strip().lower()
            # Skip entries with nothing besides dots/whitespace.
            if not ext.strip("."):
                continue
            if not ext.startswith("."):
                ext = "." + ext
            chosen.add(ext)
        # If every entry was blank, fall back to --file-type below.
        if chosen:
            return chosen

    if args.file_type == "sas":
        return set(SAS_EXTENSIONS)
    if args.file_type == "text":
        return set(TEXT_EXTENSIONS)
    return set(SUPPORTED_EXTENSIONS)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_delimiter(args: argparse.Namespace) -> str:
    """Return the effective delimiter from parsed CLI *args*.

    Word aliases let users name awkward characters on the command line
    without shell-escaping issues (matched case-insensitively):

    * ``tab`` or the two-character sequence ``\\t`` → a tab character
    * ``pipe`` → ``|``

    Any other value is returned unchanged.  When ``--delimiter`` is absent
    or empty, ``DEFAULT_DELIMITER`` is returned, because an empty string is
    not a usable delimiter.
    """
    raw = args.delimiter
    if not raw:
        return DEFAULT_DELIMITER

    aliases = {"tab": "\t", "\\t": "\t", "pipe": "|"}
    return aliases.get(raw.lower(), raw)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Main
|
# Main
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import os
|
parser = build_arg_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# --- Apply CLI overrides to module-level config ---------------------------
|
||||||
|
if args.bucket:
|
||||||
|
S3_BUCKET = args.bucket
|
||||||
|
if args.profile:
|
||||||
|
AWS_PROFILE = args.profile
|
||||||
|
input_file = args.input_file if args.input_file else INPUT_FILE
|
||||||
|
|
||||||
|
active_extensions = resolve_extensions(args)
|
||||||
|
FILE_EXTENSIONS = active_extensions
|
||||||
|
|
||||||
|
delimiter = resolve_delimiter(args)
|
||||||
|
encoding = args.encoding
|
||||||
|
quotechar = args.quotechar
|
||||||
|
|
||||||
# --- Read input file ------------------------------------------------------
|
# --- Read input file ------------------------------------------------------
|
||||||
if not os.path.exists(INPUT_FILE):
|
if not os.path.exists(input_file):
|
||||||
print(f"ERROR: Input file not found: {INPUT_FILE}", file=sys.stderr)
|
print(f"ERROR: Input file not found: {input_file}", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
prefixes = read_input_file(INPUT_FILE)
|
prefixes = read_input_file(input_file)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
|
print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
@ -399,7 +658,17 @@ if __name__ == "__main__":
|
|||||||
print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
|
print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# --- Explore --------------------------------------------------------------
|
# --- Print active configuration -------------------------------------------
|
||||||
|
ext_label = extensions_label(active_extensions)
|
||||||
print(f"Bucket: {S3_BUCKET}", file=sys.stderr)
|
print(f"Bucket: {S3_BUCKET}", file=sys.stderr)
|
||||||
results = explore_directories(prefixes)
|
print(f"Extensions: {ext_label}", file=sys.stderr)
|
||||||
print_results(results)
|
if active_extensions & TEXT_EXTENSIONS:
|
||||||
|
print(
|
||||||
|
f"Text opts: delimiter={delimiter!r} encoding={encoding!r} "
|
||||||
|
f"quotechar={quotechar!r}",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Explore --------------------------------------------------------------
|
||||||
|
results = explore_directories(prefixes, extensions=active_extensions)
|
||||||
|
print_results(results, extensions=active_extensions)
|
||||||
|
|||||||
@ -1,15 +1,23 @@
|
|||||||
"""Standalone utility to download a .sas7bdat file from S3 and print a
|
"""Standalone utility to download a SAS or delimited text file from S3 and
|
||||||
column-level summary of the first 10 rows.
|
print a column-level summary of the first *N* rows.
|
||||||
|
|
||||||
Configure the four constants below, then run::
|
Supported formats
|
||||||
|
-----------------
|
||||||
|
* **SAS** – ``.sas7bdat``, ``.xpt``, ``.xport`` (read via *pyreadstat*)
|
||||||
|
* **Text** – ``.csv``, ``.tsv``, ``.txt`` (read via *pandas.read_csv*)
|
||||||
|
|
||||||
|
Configure the four constants below **or** use the CLI arguments, then run::
|
||||||
|
|
||||||
python3 file_viewer.py
|
python3 file_viewer.py
|
||||||
|
python3 file_viewer.py --local path/to/file.csv
|
||||||
|
python3 file_viewer.py --local path/to/data.tsv --delimiter $'\\t'
|
||||||
|
|
||||||
Python 3.14 compatible.
|
Python 3.14 compatible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@ -19,14 +27,28 @@ import pyreadstat
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Configuration — edit these before running
|
# Supported file extensions
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
SAS_EXTENSIONS: set[str] = {".sas7bdat", ".xpt", ".xport"}
|
||||||
|
"""File extensions recognised as SAS data files."""
|
||||||
|
|
||||||
|
TEXT_EXTENSIONS: set[str] = {".txt", ".csv", ".tsv"}
|
||||||
|
"""File extensions recognised as delimited text files."""
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS: set[str] = SAS_EXTENSIONS | TEXT_EXTENSIONS
|
||||||
|
"""Union of all supported file extensions."""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration — edit these before running (or use CLI arguments)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
S3_BUCKET: str = "my-bucket"
|
S3_BUCKET: str = "my-bucket"
|
||||||
"""S3 bucket name."""
|
"""S3 bucket name."""
|
||||||
|
|
||||||
S3_KEY: str = "path/to/file.sas7bdat"
|
S3_KEY: str = "path/to/file.sas7bdat"
|
||||||
"""Object key (path) within the bucket to the .sas7bdat file."""
|
"""Object key (path) within the bucket to a supported data file."""
|
||||||
|
|
||||||
LOCAL_FOLDER: str = "./downloads"
|
LOCAL_FOLDER: str = "./downloads"
|
||||||
"""Local directory to download the file into."""
|
"""Local directory to download the file into."""
|
||||||
@ -45,6 +67,8 @@ def _ensure_local_copy(bucket: str, key: str, local_path: str) -> None:
|
|||||||
|
|
||||||
If *local_path* exists and its size matches the S3 object's size, the
|
If *local_path* exists and its size matches the S3 object's size, the
|
||||||
download is skipped and a message is printed.
|
download is skipped and a message is printed.
|
||||||
|
|
||||||
|
Supports any file whose extension is in :data:`SUPPORTED_EXTENSIONS`.
|
||||||
"""
|
"""
|
||||||
session = boto3.Session(profile_name=AWS_PROFILE)
|
session = boto3.Session(profile_name=AWS_PROFILE)
|
||||||
s3 = session.client("s3")
|
s3 = session.client("s3")
|
||||||
@ -69,12 +93,117 @@ def _ensure_local_copy(bucket: str, key: str, local_path: str) -> None:
|
|||||||
print("Download complete.")
|
print("Download complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# -- SAS readers -------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
|
def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
|
||||||
"""Read the first *row_count* rows of a .sas7bdat file."""
|
"""Read the first *row_count* rows of a SAS file (``.sas7bdat``, ``.xpt``, ``.xport``)."""
|
||||||
|
ext = os.path.splitext(path)[1].lower()
|
||||||
|
if ext == ".sas7bdat":
|
||||||
df, _ = pyreadstat.read_sas7bdat(path, row_offset=0, row_limit=row_count)
|
df, _ = pyreadstat.read_sas7bdat(path, row_offset=0, row_limit=row_count)
|
||||||
|
elif ext in {".xpt", ".xport"}:
|
||||||
|
df, _ = pyreadstat.read_xport(path, row_offset=0, row_limit=row_count)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported SAS extension: {ext}")
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
# -- Text readers ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_head(
|
||||||
|
path: str,
|
||||||
|
row_count: int = 10,
|
||||||
|
delimiter: str = ",",
|
||||||
|
encoding: str = "utf-8",
|
||||||
|
quotechar: str = '"',
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Read the first *row_count* rows of a delimited text file.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
path : str
|
||||||
|
Path to the ``.csv``, ``.tsv``, or ``.txt`` file.
|
||||||
|
row_count : int, optional
|
||||||
|
Number of data rows to read (default ``10``).
|
||||||
|
delimiter : str, optional
|
||||||
|
Column delimiter (default ``","``). For ``.tsv`` files the caller
|
||||||
|
should pass ``"\\t"``.
|
||||||
|
encoding : str, optional
|
||||||
|
File encoding (default ``"utf-8"``).
|
||||||
|
quotechar : str, optional
|
||||||
|
Character used to quote fields (default ``'"'``).
|
||||||
|
"""
|
||||||
|
return pd.read_csv(
|
||||||
|
path,
|
||||||
|
sep=delimiter,
|
||||||
|
encoding=encoding,
|
||||||
|
quotechar=quotechar,
|
||||||
|
nrows=row_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# -- Unified reader ----------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _read_head(
    path: str,
    row_count: int = 10,
    delimiter: str | None = None,
    encoding: str = "utf-8",
    quotechar: str = '"',
) -> pd.DataFrame:
    """Load the leading rows of any supported data file.

    The lower-cased extension of *path* selects the reader: SAS extensions
    are handed to ``_read_sas_head`` and every other supported extension is
    handed to ``_read_text_head``.

    Parameters
    ----------
    path : str
        Location of the data file.
    row_count : int, optional
        How many data rows to load (default ``10``).
    delimiter : str or None, optional
        Separator for text files.  When ``None`` it is chosen from the
        extension: tab for ``.tsv``, comma for anything else.
    encoding : str, optional
        Text-file encoding (default ``"utf-8"``).
    quotechar : str, optional
        Text-file quote character (default ``'"'``).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension of *path* is not in ``SUPPORTED_EXTENSIONS``.
    """
    suffix = os.path.splitext(path)[1].lower()

    # Reject unknown extensions up front so neither reader sees them.
    if suffix not in SUPPORTED_EXTENSIONS:
        known = sorted(SUPPORTED_EXTENSIONS)
        raise ValueError(
            f"Unsupported file extension '{suffix}'. Supported extensions: {known}"
        )

    if suffix in SAS_EXTENSIONS:
        return _read_sas_head(path, row_count=row_count)

    # Text file: an explicit delimiter wins, otherwise infer it.
    if delimiter is not None:
        separator = delimiter
    elif suffix == ".tsv":
        separator = "\t"
    else:
        separator = ","

    return _read_text_head(
        path,
        row_count=row_count,
        delimiter=separator,
        encoding=encoding,
        quotechar=quotechar,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# -- Display -----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _sample_values(series: pd.Series, n: int = 3) -> str:
|
def _sample_values(series: pd.Series, n: int = 3) -> str:
|
||||||
"""Return up to *n* non-null sample values as a comma-separated string."""
|
"""Return up to *n* non-null sample values as a comma-separated string."""
|
||||||
non_null = series.dropna()
|
non_null = series.dropna()
|
||||||
@ -114,26 +243,126 @@ def _print_summary(df: pd.DataFrame) -> None:
|
|||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the file-viewer CLI.

    Options defined:

    * ``--local FILE`` / ``--s3-key KEY`` -- mutually exclusive data source.
    * ``--rows N`` -- number of rows to read; must be a positive integer.
    * ``--delimiter``, ``--encoding``, ``--quotechar`` -- text-file options.

    Returns
    -------
    argparse.ArgumentParser
        The configured parser; nothing is parsed here.
    """

    def _positive_int(value: str) -> int:
        """Convert *value* for ``--rows``, rejecting anything below 1."""
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid int value: {value!r}"
            ) from None
        # Zero is rejected as well as negatives: pyreadstat treats a row
        # limit of 0 as "no limit", which would load the entire SAS file.
        if number < 1:
            raise argparse.ArgumentTypeError(
                f"must be a positive integer, got {number}"
            )
        return number

    supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))

    parser = argparse.ArgumentParser(
        description=(
            "Download a SAS or delimited text file from S3 (or read a local "
            "file) and print a column-level summary of the first N rows.\n\n"
            "Supported extensions: "
            + supported
        ),
        # Keep the blank line in the description instead of re-wrapping it.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # The file comes either from disk or from S3, never both.
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--local",
        metavar="FILE",
        default=None,
        help=(
            "Path to a local data file to summarise (skips S3 download). "
            "Supported extensions: "
            + supported
        ),
    )
    source.add_argument(
        "--s3-key",
        metavar="KEY",
        default=None,
        help="Override the S3_KEY constant with this object key.",
    )

    parser.add_argument(
        "--rows",
        type=_positive_int,
        default=10,
        metavar="N",
        help="Number of rows to read (default: 10).",
    )

    # Text-file-specific options
    text_group = parser.add_argument_group(
        "text file options",
        "These options apply only to .csv / .tsv / .txt files.",
    )
    text_group.add_argument(
        "--delimiter",
        default=None,
        help=(
            'Column delimiter for text files (default: "," for .csv/.txt, '
            '"\\t" for .tsv). Use $\'\\t\' in the shell for a literal tab.'
        ),
    )
    text_group.add_argument(
        "--encoding",
        default="utf-8",
        help='File encoding for text files (default: "utf-8").',
    )
    text_group.add_argument(
        "--quotechar",
        default='"',
        help='Quote character for text files (default: \'"\').',
    )

    return parser
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Main
|
# Main
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# --- Download -----------------------------------------------------------
|
parser = _build_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.local:
|
||||||
|
# ---- Local file mode -----------------------------------------------
|
||||||
|
local_path = args.local
|
||||||
|
ext = os.path.splitext(local_path)[1].lower()
|
||||||
|
if ext not in SUPPORTED_EXTENSIONS:
|
||||||
|
parser.error(
|
||||||
|
f"Unsupported file extension '{ext}'. "
|
||||||
|
f"Supported: {sorted(SUPPORTED_EXTENSIONS)}"
|
||||||
|
)
|
||||||
|
if not os.path.isfile(local_path):
|
||||||
|
print(f"File not found: {local_path}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
# ---- S3 download mode ----------------------------------------------
|
||||||
|
s3_key = args.s3_key or S3_KEY
|
||||||
|
ext = os.path.splitext(s3_key)[1].lower()
|
||||||
|
if ext not in SUPPORTED_EXTENSIONS:
|
||||||
|
parser.error(
|
||||||
|
f"Unsupported file extension '{ext}' in S3 key. "
|
||||||
|
f"Supported: {sorted(SUPPORTED_EXTENSIONS)}"
|
||||||
|
)
|
||||||
|
|
||||||
os.makedirs(LOCAL_FOLDER, exist_ok=True)
|
os.makedirs(LOCAL_FOLDER, exist_ok=True)
|
||||||
local_filename = os.path.basename(S3_KEY)
|
local_filename = os.path.basename(s3_key)
|
||||||
local_path = os.path.join(LOCAL_FOLDER, local_filename)
|
local_path = os.path.join(LOCAL_FOLDER, local_filename)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_ensure_local_copy(S3_BUCKET, S3_KEY, local_path)
|
_ensure_local_copy(S3_BUCKET, s3_key, local_path)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"S3 download error: {exc}", file=sys.stderr)
|
print(f"S3 download error: {exc}", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# --- Read & summarize ---------------------------------------------------
|
# ---- Read & summarise --------------------------------------------------
|
||||||
try:
|
try:
|
||||||
df = _read_sas_head(local_path, row_count=10)
|
df = _read_head(
|
||||||
|
local_path,
|
||||||
|
row_count=args.rows,
|
||||||
|
delimiter=args.delimiter,
|
||||||
|
encoding=args.encoding,
|
||||||
|
quotechar=args.quotechar,
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"File read error: {exc}", file=sys.stderr)
|
print(f"File read error: {exc}", file=sys.stderr)
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|||||||
@ -5,6 +5,10 @@ under that prefix recursively, groups objects into *clusters* using the same
|
|||||||
explicit-pattern + auto-detect rules as ``load_folder.py``, and downloads each
|
explicit-pattern + auto-detect rules as ``load_folder.py``, and downloads each
|
||||||
cluster's files into its own subfolder under a local destination root.
|
cluster's files into its own subfolder under a local destination root.
|
||||||
|
|
||||||
|
Supported file types:
|
||||||
|
* SAS data files: ``.sas7bdat``, ``.xpt``, ``.xport``
|
||||||
|
* Delimited text files: ``.txt``, ``.csv``, ``.tsv``
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
USAGE
|
USAGE
|
||||||
-------------------------------------------------------------------------------
|
-------------------------------------------------------------------------------
|
||||||
@ -19,8 +23,9 @@ USAGE
|
|||||||
aws_profile: default # optional; default boto3 chain if omitted
|
aws_profile: default # optional; default boto3 chain if omitted
|
||||||
|
|
||||||
auto_detect: true # optional; default true
|
auto_detect: true # optional; default true
|
||||||
extensions: # optional; default sas7bdat/xpt/xport
|
extensions: # optional; default sas7bdat/xpt/xport/txt/csv/tsv
|
||||||
- .sas7bdat
|
- .sas7bdat
|
||||||
|
- .csv
|
||||||
on_exists: skip # optional; skip | overwrite | error
|
on_exists: skip # optional; skip | overwrite | error
|
||||||
concurrency: 4 # optional; default 4
|
concurrency: 4 # optional; default 4
|
||||||
|
|
||||||
@ -58,7 +63,8 @@ Exit codes:
|
|||||||
* Listing is recursive (no S3 ``Delimiter``). Regexes are matched against
|
* Listing is recursive (no S3 ``Delimiter``). Regexes are matched against
|
||||||
the *basename* of each key (the part after the last ``/``), so a nested
|
the *basename* of each key (the part after the last ``/``), so a nested
|
||||||
object like ``census/2020/raw/nested/group_c1.sas7bdat`` is grouped by
|
object like ``census/2020/raw/nested/group_c1.sas7bdat`` is grouped by
|
||||||
``group_c1.sas7bdat`` alone.
|
``group_c1.sas7bdat`` alone. Text files (e.g. ``data.csv``) are handled
|
||||||
|
identically — the basename is extracted and matched the same way.
|
||||||
* Explicit patterns are tried in order. A key matched by one pattern is
|
* Explicit patterns are tried in order. A key matched by one pattern is
|
||||||
removed from the pool before the next pattern runs. Overlap between
|
removed from the pool before the next pattern runs. Overlap between
|
||||||
patterns is flagged as an error at discovery time.
|
patterns is flagged as an error at discovery time.
|
||||||
@ -97,7 +103,9 @@ import boto3
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_EXTENSIONS: Tuple[str, ...] = (".sas7bdat", ".xpt", ".xport")
|
SAS_EXTENSIONS: Tuple[str, ...] = (".sas7bdat", ".xpt", ".xport")
|
||||||
|
TEXT_EXTENSIONS: Tuple[str, ...] = (".txt", ".csv", ".tsv")
|
||||||
|
DEFAULT_EXTENSIONS: Tuple[str, ...] = SAS_EXTENSIONS + TEXT_EXTENSIONS
|
||||||
VALID_ON_EXISTS: Tuple[str, ...] = ("skip", "overwrite", "error")
|
VALID_ON_EXISTS: Tuple[str, ...] = ("skip", "overwrite", "error")
|
||||||
DEFAULT_CONCURRENCY: int = 4
|
DEFAULT_CONCURRENCY: int = 4
|
||||||
|
|
||||||
@ -318,7 +326,12 @@ def build_s3_client(cfg: DownloadConfig):
|
|||||||
|
|
||||||
|
|
||||||
def list_s3_objects(s3_client, cfg: DownloadConfig) -> List[S3Object]:
|
def list_s3_objects(s3_client, cfg: DownloadConfig) -> List[S3Object]:
|
||||||
"""List all objects under ``cfg.prefix`` recursively, filtered by extension."""
|
"""List all objects under ``cfg.prefix`` recursively, filtered by extension.
|
||||||
|
|
||||||
|
Supports SAS extensions (``.sas7bdat``, ``.xpt``, ``.xport``) and text
|
||||||
|
extensions (``.txt``, ``.csv``, ``.tsv``) — whichever are present in
|
||||||
|
``cfg.extensions``.
|
||||||
|
"""
|
||||||
paginator = s3_client.get_paginator("list_objects_v2")
|
paginator = s3_client.get_paginator("list_objects_v2")
|
||||||
out: List[S3Object] = []
|
out: List[S3Object] = []
|
||||||
for page in paginator.paginate(Bucket=cfg.bucket, Prefix=cfg.prefix):
|
for page in paginator.paginate(Bucket=cfg.bucket, Prefix=cfg.prefix):
|
||||||
@ -584,8 +597,12 @@ def download_cluster(
|
|||||||
def _build_argparser() -> argparse.ArgumentParser:
|
def _build_argparser() -> argparse.ArgumentParser:
|
||||||
p = argparse.ArgumentParser(
|
p = argparse.ArgumentParser(
|
||||||
description=(
|
description=(
|
||||||
"Download S3 objects under a prefix into a local folder, "
|
"Download S3 objects (SAS data files and/or delimited text files) "
|
||||||
"grouping objects into clusters that each become one subfolder."
|
"under a prefix into a local folder, grouping objects into "
|
||||||
|
"clusters that each become one subfolder. "
|
||||||
|
"Supported extensions: "
|
||||||
|
+ ", ".join(DEFAULT_EXTENSIONS)
|
||||||
|
+ "."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
|
|||||||
@ -52,11 +52,13 @@ local_folder: ./downloads
|
|||||||
auto_detect: true
|
auto_detect: true
|
||||||
|
|
||||||
# Object extensions to consider. Anything else under the prefix is ignored.
|
# Object extensions to consider. Anything else under the prefix is ignored.
|
||||||
# Default (when this key is omitted): .sas7bdat, .xpt, .xport (matches
|
# Default (when this key is omitted): .sas7bdat, .xpt, .xport, .txt, .csv, .tsv
|
||||||
# generic_loader/load_folder.py).
|
|
||||||
# extensions:
|
# extensions:
|
||||||
# - .sas7bdat
|
# - .sas7bdat
|
||||||
# - .xpt
|
# - .xpt
|
||||||
|
# - .txt
|
||||||
|
# - .csv
|
||||||
|
# - .tsv
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Optional: download behavior
|
# Optional: download behavior
|
||||||
@ -103,3 +105,7 @@ clusters:
|
|||||||
#
|
#
|
||||||
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
|
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
|
||||||
# name: year2020_regionA_detail
|
# name: year2020_regionA_detail
|
||||||
|
|
||||||
|
# Text file cluster example (when file_type: text):
|
||||||
|
# - pattern: '^data_group_a\d+\.txt$'
|
||||||
|
# name: data_group_a
|
||||||
|
|||||||
1274
utils/sas_profiler.py
Normal file
1274
utils/sas_profiler.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user