foxtrot/utils/data_explorer.py
2026-04-21 20:05:26 -05:00

622 lines
20 KiB
Python

"""Explore S3 directories and categorise them by accessibility.
Reads a text file containing one S3 prefix per line (paths within the bucket
configured by the ``S3_BUCKET`` constant or ``--bucket`` CLI argument), then
for each prefix:
- Lists all objects recursively (via ``list_objects_v2`` paginator)
- **Only considers files matching the configured extensions** (default: all
supported extensions — SAS and text). All other file types are ignored.
- Tests read permission with ``head_object`` on the first matching file found
- Categorises the directory as **Available**, **Blocked**, or **Empty**
Supported file types
--------------------
* **SAS files**: ``.sas7bdat``, ``.xpt``, ``.xport``
* **Text / delimited files**: ``.txt``, ``.csv``, ``.tsv``
A directory is considered *empty* if it contains no files matching the
extension filter, even when other file types are present.
Configure the constants below (or use CLI arguments), then run::
python3 data_explorer.py [OPTIONS]
Python 3.10+ compatible. Requires ``boto3`` / ``botocore`` and stdlib.
"""
from __future__ import annotations
import argparse
import os
import sys
from dataclasses import dataclass, field
from typing import List, Set, Tuple
# ---------------------------------------------------------------------------
# Dependency check
# ---------------------------------------------------------------------------
try:
    import boto3  # noqa: F401
    import botocore.exceptions  # noqa: F401
except ImportError:
    # Without the AWS SDK nothing in this module can work, so explain how to
    # install it and stop with a non-zero status.
    sys.stderr.write(
        "ERROR: boto3 / botocore is not installed.\n"
        "Install with: pip install boto3"
        "\n"
    )
    raise SystemExit(1)
# ---------------------------------------------------------------------------
# Extension constants
# ---------------------------------------------------------------------------
SAS_EXTENSIONS: Set[str] = {".sas7bdat", ".xpt", ".xport"}
"""Suffixes that identify SAS data files."""
TEXT_EXTENSIONS: Set[str] = {".txt", ".csv", ".tsv"}
"""Suffixes that identify delimited text / CSV files."""
SUPPORTED_EXTENSIONS: Set[str] = SAS_EXTENSIONS.union(TEXT_EXTENSIONS)
"""Every suffix the tool knows how to handle (SAS plus text)."""
# ---------------------------------------------------------------------------
# Configuration defaults — edit these or override via CLI arguments
# ---------------------------------------------------------------------------
FILE_EXTENSIONS: Set[str] = SUPPORTED_EXTENSIONS
"""Active extension filter (matched case-insensitively); all supported by default."""
INPUT_FILE: str = "s3_directories.txt"
"""Text file listing the S3 prefixes to explore, one per line."""
S3_BUCKET: str = "my-bucket"
"""Name of the bucket every prefix is assumed to belong to."""
AWS_PROFILE: str = "default"
"""Name of the AWS CLI profile used to authenticate."""
# Defaults for reading text files (used when downloading / previewing them)
DEFAULT_DELIMITER: str = ","
DEFAULT_ENCODING: str = "utf-8"
DEFAULT_QUOTECHAR: str = '"'
# ---------------------------------------------------------------------------
# Auto-detection helpers
# ---------------------------------------------------------------------------
def detect_file_type(filename: str) -> str:
    """Classify *filename* as ``'sas'``, ``'text'``, or ``'unknown'`` by its suffix.

    The suffix comparison ignores case.  Callers handling ``.tsv`` files
    should default the delimiter to a tab character (``'\\t'``).

    Examples
    --------
    >>> detect_file_type("data.sas7bdat")
    'sas'
    >>> detect_file_type("report.CSV")
    'text'
    >>> detect_file_type("archive.zip")
    'unknown'
    """
    _, suffix = os.path.splitext(filename)
    suffix = suffix.lower()
    # SAS is checked first, then text; anything else is unknown.
    for label, known_suffixes in (("sas", SAS_EXTENSIONS), ("text", TEXT_EXTENSIONS)):
        if suffix in known_suffixes:
            return label
    return "unknown"
def default_delimiter_for(filename: str) -> str:
    """Pick the default field delimiter implied by *filename*'s suffix.

    * ``.tsv`` (any letter case) → ``'\\t'``
    * anything else → ``','``
    """
    _, suffix = os.path.splitext(filename)
    return "\t" if suffix.lower() == ".tsv" else ","
def matches_extensions(key: str, extensions: Set[str]) -> bool:
    """Return ``True`` if *key* ends with any extension in *extensions*.

    The comparison is case-insensitive on both sides: the key and every
    extension are lower-cased before matching, so ``{".CSV"}`` matches
    ``"data.csv"`` as well as ``"DATA.CSV"``.

    Parameters
    ----------
    key:
        Object key (or file name) to test.
    extensions:
        Suffixes to look for, e.g. ``{".csv", ".tsv"}``.  An empty set never
        matches.
    """
    # str.endswith accepts a tuple of candidates and checks them in one call.
    suffixes = tuple(ext.lower() for ext in extensions)
    return key.lower().endswith(suffixes)
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class AvailableDir:
    """An S3 directory that is readable.

    ``explore_directories`` records one of these when the ``head_object``
    probe on a matching file reports no error.
    """

    prefix: str  # S3 key prefix that was explored
    file_count: int  # number of objects matching the extension filter
    total_size: int  # bytes, summed over the matching objects
@dataclass
class BlockedDir:
    """An S3 directory where access was denied or an error occurred.

    ``explore_directories`` records one of these either when listing the
    prefix fails or when the ``head_object`` probe returns an error.
    """

    prefix: str  # S3 key prefix that was explored
    file_count: int  # matching objects found; 0 when the listing itself failed
    error: str  # human-readable description of the failure
@dataclass
class EmptyDir:
    """An S3 directory with no files matching the extension filter.

    The prefix may still contain objects of other types; it is recorded as
    empty whenever the listing yields no matching key.
    """

    prefix: str  # S3 key prefix that was explored
@dataclass
class Results:
    """Aggregated exploration results.

    Each list starts empty (a fresh list per instance) and is filled by
    ``explore_directories`` in the order the prefixes were processed.
    """

    # Prefixes whose probe file could be read.
    available: List[AvailableDir] = field(default_factory=list)
    # Prefixes where listing or the read probe failed.
    blocked: List[BlockedDir] = field(default_factory=list)
    # Prefixes with no files matching the extension filter.
    empty: List[EmptyDir] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def read_input_file(path: str) -> List[str]:
    """Return a list of S3 prefixes from *path*, ignoring blanks and comments.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are skipped.  Leading and trailing slashes are then
    removed and a single trailing ``/`` is re-added, so every non-empty
    prefix ends with exactly one ``/``.  A line consisting only of slashes
    yields the empty (bucket-root) prefix ``""``.

    The file is decoded as ``utf-8-sig`` so that a UTF-8 byte-order mark, as
    written by some Windows editors, is discarded instead of being glued onto
    the first prefix.  Files without a BOM are read exactly as plain UTF-8.

    Parameters
    ----------
    path:
        Location of the text file to read.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    prefixes: List[str] = []
    with open(path, encoding="utf-8-sig") as fh:
        for raw_line in fh:
            entry = raw_line.strip()
            if not entry or entry.startswith("#"):
                continue
            # Normalise: drop surrounding slashes, then re-add a single
            # trailing slash (unless the prefix is empty/root).
            entry = entry.strip("/")
            prefixes.append(f"{entry}/" if entry else entry)
    return prefixes
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB, TB).

    Values below 1024 are shown verbatim in bytes.  Larger values are divided
    by 1024 until they drop below 1024 or reach terabytes, and are shown with
    one decimal place and thousands separators.
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    units = ("KB", "MB", "GB", "TB")
    largest = units[-1]
    scaled = size_bytes
    for unit in units:
        scaled = scaled / 1024.0
        # Stop at the first unit that fits; TB is the ceiling regardless.
        if unit == largest or scaled < 1024.0:
            break
    return f"{scaled:,.1f} {unit}"
def extensions_label(extensions: Set[str]) -> str:
    """Join *extensions* into a compact, alphabetically sorted label (e.g. ``.csv/.tsv/.txt``)."""
    ordered = sorted(extensions)
    return "/".join(ordered)
def list_objects(
    s3_client: "botocore.client.S3",
    bucket: str,
    prefix: str,
    extensions: Set[str] | None = None,
) -> Tuple[str | None, int, int]:
    """Walk every object under *prefix* and tally the ones matching *extensions*.

    An object is counted only when its key ends with one of *extensions*
    (compared case-insensitively); every other object is skipped.  Passing
    ``None`` for *extensions* selects the module-level ``FILE_EXTENSIONS``.

    Returns ``(first_key, file_count, total_size)``: the key of the first
    matching object (``None`` when nothing matched), the number of matching
    objects, and the sum of their sizes in bytes.

    Only running counters are kept while paging through the listing; the
    keys themselves are never collected.
    """
    wanted = FILE_EXTENSIONS if extensions is None else extensions
    suffixes = tuple({ext.lower() for ext in wanted})

    first_match: str | None = None
    matched = 0
    size_sum = 0

    pages = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )
    for page in pages:
        # Pages without a "Contents" entry contribute nothing.
        for entry in page.get("Contents", []):
            name = entry["Key"]
            if name.lower().endswith(suffixes):
                matched += 1
                size_sum += entry["Size"]
                if first_match is None:
                    first_match = name
    return first_match, matched, size_sum
def check_read_permission(
    s3_client: "botocore.client.S3",
    bucket: str,
    key: str,
) -> str | None:
    """Try ``head_object`` on *key*. Return ``None`` on success or an error string.

    Parameters
    ----------
    s3_client:
        Client object exposing ``head_object(Bucket=..., Key=...)``.
    bucket:
        Name of the bucket that holds *key*.
    key:
        Object key to probe.

    Returns
    -------
    ``None`` when the call succeeds.  For a service-side ``ClientError`` the
    result is ``"<message> (<code>)"``; for any other botocore failure
    (``BotoCoreError``, e.g. a connection problem) it is the exception text.
    Reporting both as strings lets a failed probe be recorded for one
    directory instead of aborting the caller's whole run.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as exc:
        details = exc.response.get("Error", {})
        code = details.get("Code", "Unknown")
        message = details.get("Message", str(exc))
        return f"{message} ({code})"
    except botocore.exceptions.BotoCoreError as exc:
        # Client-side failures carry no service error code.
        return str(exc)
    return None
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def _format_client_error(exc: "botocore.exceptions.ClientError") -> str:
    """Render a botocore ``ClientError`` as ``"<message> (<code>)"``."""
    details = exc.response.get("Error", {})
    code = details.get("Code", "Unknown")
    message = details.get("Message", str(exc))
    return f"{message} ({code})"


def _select_probe_key(
    s3_client: "botocore.client.S3",
    bucket: str,
    prefix: str,
    suffixes: Tuple[str, ...],
    fallback_key: str,
) -> str:
    """Return the first non-marker key under *prefix* ending with one of *suffixes*.

    At most 1000 listed objects are inspected.  A key counts as a directory
    marker when it ends in ``/`` and has size 0.  *fallback_key* is returned
    when no suitable key is found.
    """
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(
        Bucket=bucket, Prefix=prefix, PaginationConfig={"MaxItems": 1000}
    )
    for page in pages:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            is_marker = key.endswith("/") and obj["Size"] == 0
            if not is_marker and key.lower().endswith(suffixes):
                return key
    return fallback_key


def explore_directories(
    prefixes: List[str],
    *,
    extensions: Set[str] | None = None,
) -> Results:
    """Explore every prefix in ``S3_BUCKET`` and return categorised *Results*.

    For each prefix the matching objects are listed with ``list_objects``;
    then ``check_read_permission`` probes one matching key.  The prefix is
    recorded as blocked when the listing fails or the probe returns an error,
    as empty when no key matches, and as available otherwise.  Progress is
    written to stderr, one line per prefix.

    Parameters
    ----------
    prefixes:
        List of S3 key prefixes to explore.
    extensions:
        Set of file extensions to filter on.  Defaults to the module-level
        ``FILE_EXTENSIONS`` (which itself defaults to ``SUPPORTED_EXTENSIONS``).

    Returns
    -------
    A ``Results`` instance whose lists follow the order of *prefixes*.
    """
    if extensions is None:
        extensions = FILE_EXTENSIONS
    suffixes = tuple({ext.lower() for ext in extensions})
    ext_label = extensions_label(extensions)

    session = boto3.Session(profile_name=AWS_PROFILE)
    s3 = session.client("s3")

    results = Results()
    total = len(prefixes)
    for idx, prefix in enumerate(prefixes, start=1):
        print(
            f"[{idx}/{total}] Checking {prefix} (filtering for {ext_label}) ...",
            file=sys.stderr,
        )

        # --- Recursive listing ------------------------------------------------
        try:
            first_key, file_count, total_size = list_objects(
                s3, S3_BUCKET, prefix, extensions=extensions,
            )
        except botocore.exceptions.ClientError as exc:
            results.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=_format_client_error(exc))
            )
            continue
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: any listing failure is recorded against this
            # prefix so the remaining prefixes are still explored.
            results.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=str(exc))
            )
            continue

        if first_key is None:
            results.empty.append(EmptyDir(prefix=prefix))
            continue

        # --- Permission check -------------------------------------------------
        # Prefer a real object over a zero-byte directory marker (key ending
        # in "/") for the head_object test.  The probe is best-effort: if it
        # fails, say so on stderr and test first_key instead.
        test_key = first_key
        if first_key.endswith("/") and total_size > 0:
            try:
                test_key = _select_probe_key(
                    s3, S3_BUCKET, prefix, suffixes, first_key
                )
            except (
                botocore.exceptions.ClientError,
                botocore.exceptions.BotoCoreError,
            ) as exc:
                print(
                    f"[{idx}/{total}] Could not look for a non-marker key under "
                    f"{prefix} ({exc}); testing {first_key} instead.",
                    file=sys.stderr,
                )

        error = check_read_permission(s3, S3_BUCKET, test_key)
        if error is None:
            results.available.append(
                AvailableDir(prefix=prefix, file_count=file_count, total_size=total_size)
            )
        else:
            results.blocked.append(
                BlockedDir(prefix=prefix, file_count=file_count, error=error)
            )
    return results
# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------
def print_results(results: Results, *, extensions: Set[str] | None = None) -> None:
    """Write a plain-text summary of *results* to stdout.

    The report has a header (bucket and extension filter) followed by three
    sections — available, blocked, and empty directories — each showing
    ``(none)`` when it has no entries.

    Parameters
    ----------
    results:
        The exploration results to display.
    extensions:
        The extension set that was used for filtering; it only affects the
        labels in the output.  Defaults to ``FILE_EXTENSIONS``.
    """
    label = extensions_label(FILE_EXTENSIONS if extensions is None else extensions)

    report = [
        "",
        "=== S3 Directory Explorer Results ===",
        f"Bucket: {S3_BUCKET}",
        f"Extensions: {label}",
    ]

    # --- Available ---
    report += ["", f"--- Available ({len(results.available)}) ---"]
    for entry in results.available:
        report.append(f" {entry.prefix}")
        report.append(
            f" Matching files ({label}): {entry.file_count} | Total Size: {format_size(entry.total_size)}"
        )
    if not results.available:
        report.append(" (none)")

    # --- Blocked ---
    report += ["", f"--- Blocked ({len(results.blocked)}) ---"]
    for entry in results.blocked:
        report.append(f" {entry.prefix}")
        # The file count is only shown when at least one matching file was seen.
        if entry.file_count:
            report.append(
                f" Matching files ({label}) found: {entry.file_count} | Error: {entry.error}"
            )
        else:
            report.append(f" Error: {entry.error}")
    if not results.blocked:
        report.append(" (none)")

    # --- Empty ---
    report += ["", f"--- Empty / no matching files ({len(results.empty)}) ---"]
    for entry in results.empty:
        report.append(f" {entry.prefix}")
    if not results.empty:
        report.append(" (none)")

    report.append("")
    for line in report:
        print(line)
# ---------------------------------------------------------------------------
# CLI argument parsing
# ---------------------------------------------------------------------------
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this tool.

    The parser has three option groups: file-type / extension selection,
    text-file reading parameters, and S3 settings (bucket, profile, input
    file).
    """
    summary = (
        "Explore S3 directories and categorise them by accessibility. "
        "Supports SAS files (.sas7bdat, .xpt, .xport) and delimited text "
        "files (.txt, .csv, .tsv)."
    )
    cli = argparse.ArgumentParser(description=summary)

    # --- File-type / extension selection ---
    file_type_help = (
        "Restrict the scan to a specific file type. "
        "'sas' = .sas7bdat/.xpt/.xport only; "
        "'text' = .txt/.csv/.tsv only; "
        "'all' = both (default)."
    )
    extensions_help = (
        "Explicit list of extensions to filter on (e.g. --extensions .csv .tsv). "
        "Overrides --file-type when provided."
    )
    selection = cli.add_argument_group("File-type selection")
    selection.add_argument(
        "--file-type",
        choices=["sas", "text", "all"],
        default="all",
        help=file_type_help,
    )
    selection.add_argument(
        "--extensions",
        nargs="+",
        metavar="EXT",
        help=extensions_help,
    )

    # --- Text-file reading parameters ---
    text_notes = (
        "Parameters used when reading delimited text files. These are "
        "stored for downstream consumers and do not affect the S3 scan "
        "itself."
    )
    delimiter_help = (
        "Field delimiter for text files (default: ',' for .csv/.txt, "
        "'\\t' for .tsv). Use 'tab' or '\\t' for a tab character."
    )
    text_options = cli.add_argument_group("Text-file parameters", description=text_notes)
    text_options.add_argument("--delimiter", default=None, help=delimiter_help)
    text_options.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help=f"Character encoding for text files (default: {DEFAULT_ENCODING}).",
    )
    text_options.add_argument(
        "--quotechar",
        default=DEFAULT_QUOTECHAR,
        help=f"Quote character for text files (default: {DEFAULT_QUOTECHAR!r}).",
    )

    # --- S3 / general settings ---
    # These default to None so the caller can tell "not given" from a value.
    s3_options = cli.add_argument_group("S3 settings")
    for flag, help_text in (
        ("--bucket", f"S3 bucket name (default: {S3_BUCKET})."),
        ("--profile", f"AWS CLI profile name (default: {AWS_PROFILE})."),
        (
            "--input-file",
            f"Path to the text file with S3 prefixes (default: {INPUT_FILE}).",
        ),
    ):
        s3_options.add_argument(flag, default=None, help=help_text)

    return cli
def resolve_extensions(args: argparse.Namespace) -> Set[str]:
    """Work out which extension set the parsed CLI *args* ask for.

    An explicit ``--extensions`` list wins; each entry is trimmed,
    lower-cased, and given a leading dot if it lacks one.  Otherwise
    ``--file-type`` picks one of the predefined sets, with anything other
    than ``'sas'`` or ``'text'`` selecting all supported extensions.
    """
    explicit = args.extensions
    if explicit:
        trimmed = (item.strip().lower() for item in explicit)
        return {item if item.startswith(".") else "." + item for item in trimmed}

    presets = {"sas": SAS_EXTENSIONS, "text": TEXT_EXTENSIONS}
    return presets.get(args.file_type, SUPPORTED_EXTENSIONS)
def resolve_delimiter(args: argparse.Namespace) -> str:
    """Translate the ``--delimiter`` value in *args* into the actual delimiter.

    With no value supplied, ``DEFAULT_DELIMITER`` is returned.  The spellings
    ``'tab'`` (any letter case) and ``'\\t'`` both mean a real tab character,
    so users need no shell escaping; every other value is returned unchanged.
    """
    requested = args.delimiter
    if requested is None:
        return DEFAULT_DELIMITER
    return "\t" if requested.lower() in ("tab", "\\t") else requested
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main(argv: List[str] | None = None) -> int:
    """Run the command-line workflow and return the process exit status.

    Steps: parse the arguments, apply the bucket / profile / extension
    overrides to the module-level configuration, read the prefix list, check
    that the AWS profile yields credentials, print the active configuration
    to stderr, then explore the prefixes and print the report to stdout.

    Parameters
    ----------
    argv:
        Argument list to parse.  ``None`` makes ``argparse`` read
        ``sys.argv[1:]``.

    Returns
    -------
    ``0`` on success or when the input file lists no prefixes; ``1`` when the
    input file is missing or unreadable, or when AWS profile validation
    fails.
    """
    # The helper functions read these module-level names, so CLI overrides
    # must rebind the globals rather than create locals.
    global S3_BUCKET, AWS_PROFILE, FILE_EXTENSIONS

    args = build_arg_parser().parse_args(argv)

    # --- Apply CLI overrides to module-level config ---------------------------
    if args.bucket:
        S3_BUCKET = args.bucket
    if args.profile:
        AWS_PROFILE = args.profile
    input_file = args.input_file if args.input_file else INPUT_FILE
    active_extensions = resolve_extensions(args)
    FILE_EXTENSIONS = active_extensions
    delimiter = resolve_delimiter(args)
    encoding = args.encoding
    quotechar = args.quotechar

    # --- Read input file ------------------------------------------------------
    if not os.path.exists(input_file):
        print(f"ERROR: Input file not found: {input_file}", file=sys.stderr)
        return 1
    try:
        prefixes = read_input_file(input_file)
    except Exception as exc:  # noqa: BLE001 - top-level boundary, reported to the user
        print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
        return 1
    if not prefixes:
        print("No valid S3 prefixes found in the input file. Nothing to do.")
        return 0

    # --- Validate AWS profile -------------------------------------------------
    try:
        session = boto3.Session(profile_name=AWS_PROFILE)
        # Force credential resolution to catch bad profiles early
        credentials = session.get_credentials()
        if credentials is None:
            raise RuntimeError(
                f"No credentials found for AWS profile {AWS_PROFILE!r}"
            )
    except botocore.exceptions.ProfileNotFound as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 1
    except Exception as exc:  # noqa: BLE001 - top-level boundary, reported to the user
        print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
        return 1

    # --- Print active configuration -------------------------------------------
    ext_label = extensions_label(active_extensions)
    print(f"Bucket: {S3_BUCKET}", file=sys.stderr)
    print(f"Extensions: {ext_label}", file=sys.stderr)
    # Text options are only relevant when a text extension is being scanned.
    if active_extensions & TEXT_EXTENSIONS:
        print(
            f"Text opts: delimiter={delimiter!r} encoding={encoding!r} "
            f"quotechar={quotechar!r}",
            file=sys.stderr,
        )

    # --- Explore --------------------------------------------------------------
    results = explore_directories(prefixes, extensions=active_extensions)
    print_results(results, extensions=active_extensions)
    return 0


if __name__ == "__main__":
    sys.exit(main())