foxtrot/utils/data_explorer.py

622 lines
20 KiB
Python
Raw Normal View History

2026-04-20 21:27:54 +00:00
"""Explore S3 directories and categorise them by accessibility.
Reads a text file containing one S3 prefix per line (paths within the bucket
2026-04-22 01:05:26 +00:00
configured by the ``S3_BUCKET`` constant or ``--bucket`` CLI argument), then
for each prefix:
2026-04-20 21:27:54 +00:00
- Lists all objects recursively (via ``list_objects_v2`` paginator)
2026-04-22 01:05:26 +00:00
- **Only considers files matching the configured extensions** (default: all
supported SAS and text extensions). All other file types are ignored.
2026-04-20 21:30:35 +00:00
- Tests read permission with ``head_object`` on the first matching file found
2026-04-20 21:27:54 +00:00
- Categorises the directory as **Available**, **Blocked**, or **Empty**
2026-04-22 01:05:26 +00:00
Supported file types
--------------------
* **SAS files**: ``.sas7bdat``, ``.xpt``, ``.xport``
* **Text / delimited files**: ``.txt``, ``.csv``, ``.tsv``
2026-04-20 21:30:35 +00:00
A directory is considered *empty* if it contains no files matching the
extension filter, even when other file types are present.
2026-04-22 01:05:26 +00:00
Configure the constants below (or use CLI arguments), then run::
2026-04-20 21:27:54 +00:00
2026-04-22 01:05:26 +00:00
python3 data_explorer.py [OPTIONS]
2026-04-20 21:27:54 +00:00
2026-04-22 01:05:26 +00:00
Python 3.10+ compatible. Requires ``boto3`` / ``botocore`` and stdlib.
2026-04-20 21:27:54 +00:00
"""
from __future__ import annotations
2026-04-22 01:05:26 +00:00
import argparse
import os
2026-04-20 21:27:54 +00:00
import sys
from dataclasses import dataclass, field
2026-04-22 01:05:26 +00:00
from typing import List, Set, Tuple
2026-04-20 21:27:54 +00:00
# ---------------------------------------------------------------------------
# Dependency check
# ---------------------------------------------------------------------------
try:
    import boto3  # noqa: F401
    import botocore.exceptions  # noqa: F401
except ImportError:
    # Exit with an actionable hint rather than a raw ImportError traceback.
    _missing_sdk_message = (
        "ERROR: boto3 / botocore is not installed.\n"
        "Install with: pip install boto3"
    )
    print(_missing_sdk_message, file=sys.stderr)
    sys.exit(1)
# ---------------------------------------------------------------------------
2026-04-22 01:05:26 +00:00
# Extension constants
# ---------------------------------------------------------------------------
SAS_EXTENSIONS: Set[str] = {".sas7bdat", ".xpt", ".xport"}
"""Extensions that identify SAS data files."""

TEXT_EXTENSIONS: Set[str] = {".txt", ".csv", ".tsv"}
"""Extensions that identify delimited text / CSV files."""

SUPPORTED_EXTENSIONS: Set[str] = SAS_EXTENSIONS.union(TEXT_EXTENSIONS)
"""Every extension the tool can work with (SAS plus text)."""
# ---------------------------------------------------------------------------
# Configuration defaults — edit these or override via CLI arguments
2026-04-20 21:27:54 +00:00
# ---------------------------------------------------------------------------
2026-04-22 01:05:26 +00:00
# Active extension filter, used whenever a function in this module is called
# without an explicit ``extensions`` argument.
# NOTE: this binds the very same set object as SUPPORTED_EXTENSIONS (no copy
# is made), so mutating one in place would also change the other.
FILE_EXTENSIONS: Set[str] = SUPPORTED_EXTENSIONS
"""Set of extensions to filter on (case-insensitive). Defaults to all supported."""
2026-04-20 21:30:35 +00:00
2026-04-20 21:27:54 +00:00
INPUT_FILE: str = "s3_directories.txt"
"""Text file that lists the S3 prefixes to explore, one per line."""

S3_BUCKET: str = "my-bucket"
"""Name of the bucket that every listed prefix is looked up in."""

AWS_PROFILE: str = "default"
"""AWS CLI profile name used to authenticate."""

# Defaults for reading delimited text files (downloading / previewing).
DEFAULT_DELIMITER: str = ","
DEFAULT_ENCODING: str = "utf-8"
DEFAULT_QUOTECHAR: str = '"'
# ---------------------------------------------------------------------------
# Auto-detection helpers
# ---------------------------------------------------------------------------
def detect_file_type(filename: str) -> str:
    """Classify *filename* as ``'sas'``, ``'text'`` or ``'unknown'``.

    Only the final extension is inspected and the comparison ignores case.
    Callers handling ``.tsv`` files should default the delimiter to a tab
    character (``'\\t'``).

    Examples
    --------
    >>> detect_file_type("data.sas7bdat")
    'sas'
    >>> detect_file_type("report.CSV")
    'text'
    >>> detect_file_type("archive.zip")
    'unknown'
    """
    _, suffix = os.path.splitext(filename)
    suffix = suffix.lower()
    # Checked in order: SAS first, then text.
    for label, known in (("sas", SAS_EXTENSIONS), ("text", TEXT_EXTENSIONS)):
        if suffix in known:
            return label
    return "unknown"
def default_delimiter_for(filename: str) -> str:
    """Pick the default field delimiter implied by *filename*'s extension.

    * ``.tsv`` (any letter case) -> ``'\\t'``
    * everything else -> ``','``
    """
    _, suffix = os.path.splitext(filename)
    return "\t" if suffix.lower() == ".tsv" else ","
def matches_extensions(key: str, extensions: Set[str]) -> bool:
    """Return ``True`` if *key* ends with any extension in *extensions*.

    The comparison is case-insensitive on both sides: the key and every
    extension are lower-cased before matching, so ``".CSV"`` matches
    ``"data.csv"`` and ``".csv"`` matches ``"DATA.CSV"``.

    Parameters
    ----------
    key:
        Object key (or file name) to test.
    extensions:
        Extensions to match against, e.g. ``{".csv", ".tsv"}``.

    Returns
    -------
    bool
        ``False`` when *extensions* is empty or nothing matches.
    """
    # str.endswith accepts a tuple of candidates; an empty tuple never matches.
    suffixes = tuple(ext.lower() for ext in extensions)
    return key.lower().endswith(suffixes)
2026-04-20 21:27:54 +00:00
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class AvailableDir:
    """A prefix whose matching files could be listed and read."""

    # S3 key prefix of the directory.
    prefix: str
    # Number of objects under the prefix that matched the extension filter.
    file_count: int
    # Combined size of the matching objects, in bytes.
    total_size: int
@dataclass
class BlockedDir:
    """A prefix that could not be listed or read (access denied or other error)."""

    # S3 key prefix of the directory.
    prefix: str
    # Matching objects counted before the failure (0 when listing itself failed).
    file_count: int
    # Human-readable description of what went wrong.
    error: str
@dataclass
class EmptyDir:
    """A prefix under which no object matched the extension filter."""

    # S3 key prefix of the directory.
    prefix: str
@dataclass
class Results:
    """Container grouping every explored prefix by outcome."""

    # Each list gets its own fresh instance per Results object.
    available: List[AvailableDir] = field(default_factory=list)
    blocked: List[BlockedDir] = field(default_factory=list)
    empty: List[EmptyDir] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def read_input_file(path: str) -> List[str]:
    """Load the S3 prefixes listed in the text file at *path*.

    Blank lines and lines starting with ``#`` are skipped. Every remaining
    line has its surrounding whitespace and leading/trailing slashes removed;
    a single trailing ``/`` is then appended unless nothing is left (the
    empty / root prefix is kept as ``""``).
    """
    collected: List[str] = []
    with open(path, encoding="utf-8") as handle:
        for text in map(str.strip, handle):
            if text == "" or text.startswith("#"):
                continue
            core = text.strip("/")
            # Only non-empty prefixes get the trailing slash re-added.
            collected.append(core + "/" if core else core)
    return collected
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB or TB).

    Values below 1024 are shown as whole bytes; larger values are scaled by
    powers of 1024 and shown with one decimal place and thousands separators.
    Anything beyond the GB range is expressed in TB.
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    scaled = float(size_bytes)
    for unit in ("KB", "MB", "GB"):
        scaled /= 1024.0
        if scaled < 1024.0:
            return f"{scaled:,.1f} {unit}"
    # TB is the largest unit, so it absorbs everything else.
    return f"{scaled / 1024.0:,.1f} TB"
2026-04-22 01:05:26 +00:00
def extensions_label(extensions: Set[str]) -> str:
    """Join *extensions* in sorted order with ``/`` (e.g. ``.csv/.tsv/.txt``)."""
    ordered = list(extensions)
    ordered.sort()
    return "/".join(ordered)
2026-04-20 21:27:54 +00:00
def list_objects(
    s3_client: "botocore.client.S3",
    bucket: str,
    prefix: str,
    extensions: Set[str] | None = None,
) -> Tuple[str | None, int, int]:
    """Count the objects under *prefix* whose key matches *extensions*.

    The listing is paginated via ``list_objects_v2`` and consumed page by
    page; only running counters are kept, never the full list of keys.

    Parameters
    ----------
    s3_client:
        Client exposing ``get_paginator("list_objects_v2")``.
    bucket:
        Bucket to list.
    prefix:
        Key prefix to list under.
    extensions:
        Extensions to keep (matched case-insensitively against the end of
        each key). ``None`` selects the module-level ``FILE_EXTENSIONS``.

    Returns
    -------
    tuple
        ``(first_key, file_count, total_size)``: the key of the first
        matching object (``None`` when nothing matched), the number of
        matching objects, and the sum of their sizes in bytes.
    """
    wanted = FILE_EXTENSIONS if extensions is None else extensions
    suffixes = tuple({ext.lower() for ext in wanted})

    first_match: str | None = None
    matched = 0
    size_sum = 0

    pages = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )
    for page in pages:
        # Pages without a "Contents" entry contribute nothing.
        for entry in page.get("Contents", []):
            key = entry["Key"]
            if not key.lower().endswith(suffixes):
                continue
            if first_match is None:
                first_match = key
            matched += 1
            size_sum += entry["Size"]
    return first_match, matched, size_sum
def check_read_permission(
    s3_client: "botocore.client.S3",
    bucket: str,
    key: str,
) -> str | None:
    """Probe read access to *key* with a ``head_object`` call.

    Parameters
    ----------
    s3_client:
        Client exposing ``head_object``.
    bucket:
        Bucket holding the object.
    key:
        Key of the object to probe.

    Returns
    -------
    str or None
        ``None`` when the call succeeds. Otherwise an error string:
        ``"<message> (<code>)"`` for service-side ``ClientError`` responses,
        or the exception text for client-side botocore failures.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as exc:
        details = exc.response.get("Error", {})
        code = details.get("Code", "Unknown")
        message = details.get("Message", str(exc))
        return f"{message} ({code})"
    except botocore.exceptions.BotoCoreError as exc:
        # Client-side failures (connection, validation, ...) are not
        # ClientError subclasses; report them as an error string so a single
        # bad probe does not abort the whole exploration run.
        return str(exc)
    return None
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
2026-04-22 01:05:26 +00:00
def explore_directories(
    prefixes: List[str],
    *,
    extensions: Set[str] | None = None,
) -> Results:
    """Classify each prefix in ``S3_BUCKET`` as available, blocked or empty.

    For every prefix the matching objects are listed, then one of them is
    probed with ``head_object``. Progress is written to stderr.

    Parameters
    ----------
    prefixes:
        S3 key prefixes to explore.
    extensions:
        File extensions to filter on. ``None`` selects the module-level
        ``FILE_EXTENSIONS``.

    Returns
    -------
    Results
        The explored prefixes grouped by outcome.
    """
    active = FILE_EXTENSIONS if extensions is None else extensions
    suffixes = tuple({ext.lower() for ext in active})
    label = extensions_label(active)

    s3 = boto3.Session(profile_name=AWS_PROFILE).client("s3")
    outcome = Results()
    how_many = len(prefixes)

    for position, prefix in enumerate(prefixes, start=1):
        print(
            f"[{position}/{how_many}] Checking {prefix} (filtering for {label}) ...",
            file=sys.stderr,
        )

        # --- Recursive listing -------------------------------------------
        # Any listing failure classifies the prefix as blocked with zero files.
        try:
            first_key, file_count, total_size = list_objects(
                s3, S3_BUCKET, prefix, extensions=active,
            )
        except botocore.exceptions.ClientError as exc:
            detail = exc.response.get("Error", {})
            code = detail.get("Code", "Unknown")
            message = detail.get("Message", str(exc))
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=f"{message} ({code})")
            )
            continue
        except Exception as exc:
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=str(exc))
            )
            continue

        if first_key is None:
            outcome.empty.append(EmptyDir(prefix=prefix))
            continue

        # --- Permission check --------------------------------------------
        # If the first match looks like a directory marker (key ending in
        # "/") but the prefix holds real data, look for a better key to probe:
        # one that is not a zero-byte marker and still matches the filter.
        # The re-scan is capped at 1000 listed items.
        probe_key = first_key
        if first_key.endswith("/") and total_size > 0:
            try:
                pages = s3.get_paginator("list_objects_v2").paginate(
                    Bucket=S3_BUCKET,
                    Prefix=prefix,
                    PaginationConfig={"MaxItems": 1000},
                )
                for page in pages:
                    candidate = next(
                        (
                            item["Key"]
                            for item in page.get("Contents", [])
                            if (not item["Key"].endswith("/") or item["Size"] != 0)
                            and item["Key"].lower().endswith(suffixes)
                        ),
                        None,
                    )
                    # Stop at the first page that yields a key other than
                    # first_key; otherwise keep scanning.
                    if candidate is not None and candidate != first_key:
                        probe_key = candidate
                        break
            except Exception:
                pass  # Best effort only: fall back to first_key

        failure = check_read_permission(s3, S3_BUCKET, probe_key)
        if failure is not None:
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=file_count, error=failure)
            )
        else:
            outcome.available.append(
                AvailableDir(prefix=prefix, file_count=file_count, total_size=total_size)
            )

    return outcome
# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------
2026-04-22 01:05:26 +00:00
def print_results(results: Results, *, extensions: Set[str] | None = None) -> None:
    """Write a human-readable summary of *results* to stdout.

    Parameters
    ----------
    results:
        The exploration results to display.
    extensions:
        Extensions that were used for filtering; only used to label the
        output. ``None`` selects the module-level ``FILE_EXTENSIONS``.
    """
    label = extensions_label(FILE_EXTENSIONS if extensions is None else extensions)

    print()
    print("=== S3 Directory Explorer Results ===")
    print(f"Bucket: {S3_BUCKET}")
    print(f"Extensions: {label}")

    # --- Available ---
    print()
    print(f"--- Available ({len(results.available)}) ---")
    for entry in results.available:
        print(f" {entry.prefix}")
        print(f" Matching files ({label}): {entry.file_count} | Total Size: {format_size(entry.total_size)}")
    if not results.available:
        print(" (none)")

    # --- Blocked ---
    print()
    print(f"--- Blocked ({len(results.blocked)}) ---")
    for entry in results.blocked:
        print(f" {entry.prefix}")
        # The file count is only shown when at least one match was counted.
        if entry.file_count:
            print(f" Matching files ({label}) found: {entry.file_count} | Error: {entry.error}")
        else:
            print(f" Error: {entry.error}")
    if not results.blocked:
        print(" (none)")

    # --- Empty ---
    print()
    print(f"--- Empty / no matching files ({len(results.empty)}) ---")
    for entry in results.empty:
        print(f" {entry.prefix}")
    if not results.empty:
        print(" (none)")
    print()
2026-04-22 01:05:26 +00:00
# ---------------------------------------------------------------------------
# CLI argument parsing
# ---------------------------------------------------------------------------
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the explorer.

    The parser exposes three option groups: file-type / extension selection,
    text-file reading parameters, and S3 settings (bucket, profile, input
    file). The S3 options default to ``None``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Explore S3 directories and categorise them by accessibility. "
            "Supports SAS files (.sas7bdat, .xpt, .xport) and delimited text "
            "files (.txt, .csv, .tsv)."
        ),
    )

    # --- File-type / extension selection ---
    file_type_help = (
        "Restrict the scan to a specific file type. "
        "'sas' = .sas7bdat/.xpt/.xport only; "
        "'text' = .txt/.csv/.tsv only; "
        "'all' = both (default)."
    )
    extensions_help = (
        "Explicit list of extensions to filter on (e.g. --extensions .csv .tsv). "
        "Overrides --file-type when provided."
    )
    selection = cli.add_argument_group("File-type selection")
    selection.add_argument(
        "--file-type",
        choices=["sas", "text", "all"],
        default="all",
        help=file_type_help,
    )
    selection.add_argument(
        "--extensions",
        nargs="+",
        metavar="EXT",
        help=extensions_help,
    )

    # --- Text-file reading parameters ---
    delimiter_help = (
        "Field delimiter for text files (default: ',' for .csv/.txt, "
        "'\\t' for .tsv). Use 'tab' or '\\t' for a tab character."
    )
    text_opts = cli.add_argument_group(
        "Text-file parameters",
        description=(
            "Parameters used when reading delimited text files. These are "
            "stored for downstream consumers and do not affect the S3 scan "
            "itself."
        ),
    )
    text_opts.add_argument("--delimiter", default=None, help=delimiter_help)
    text_opts.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help=f"Character encoding for text files (default: {DEFAULT_ENCODING}).",
    )
    text_opts.add_argument(
        "--quotechar",
        default=DEFAULT_QUOTECHAR,
        help=f"Quote character for text files (default: {DEFAULT_QUOTECHAR!r}).",
    )

    # --- S3 / general settings ---
    s3_opts = cli.add_argument_group("S3 settings")
    s3_flags = (
        ("--bucket", f"S3 bucket name (default: {S3_BUCKET})."),
        ("--profile", f"AWS CLI profile name (default: {AWS_PROFILE})."),
        ("--input-file", f"Path to the text file with S3 prefixes (default: {INPUT_FILE})."),
    )
    for flag, help_text in s3_flags:
        s3_opts.add_argument(flag, default=None, help=help_text)

    return cli
def resolve_extensions(args: argparse.Namespace) -> Set[str]:
    """Work out which extension set the parsed CLI *args* select.

    An explicit ``--extensions`` list wins: each entry is stripped,
    lower-cased and given a leading dot if it lacks one. Otherwise
    ``--file-type`` picks one of the predefined sets (anything other than
    ``'sas'`` or ``'text'`` selects all supported extensions).
    """
    requested = args.extensions
    if requested:
        cleaned = (item.strip().lower() for item in requested)
        return {item if item.startswith(".") else "." + item for item in cleaned}

    presets = {"sas": SAS_EXTENSIONS, "text": TEXT_EXTENSIONS}
    return presets.get(args.file_type, SUPPORTED_EXTENSIONS)
def resolve_delimiter(args: argparse.Namespace) -> str:
    """Translate the ``--delimiter`` CLI value into the actual delimiter.

    With no ``--delimiter`` given, ``DEFAULT_DELIMITER`` is returned. The
    spellings ``'tab'`` and ``'\\t'`` (any letter case) are mapped to a real
    tab character so users need no shell escaping; any other value is
    returned unchanged.
    """
    chosen = args.delimiter
    if chosen is None:
        return DEFAULT_DELIMITER
    return "\t" if chosen.lower() in {"tab", "\\t"} else chosen
2026-04-20 21:27:54 +00:00
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()

    # --- Apply CLI overrides to module-level config ------------------------
    # These rebinds must happen before explore_directories / print_results
    # run, because those functions read the module-level names.
    S3_BUCKET = args.bucket or S3_BUCKET
    AWS_PROFILE = args.profile or AWS_PROFILE
    input_file = args.input_file or INPUT_FILE
    FILE_EXTENSIONS = active_extensions = resolve_extensions(args)
    delimiter = resolve_delimiter(args)
    encoding, quotechar = args.encoding, args.quotechar

    # --- Read input file ---------------------------------------------------
    if not os.path.exists(input_file):
        print(f"ERROR: Input file not found: {input_file}", file=sys.stderr)
        sys.exit(1)
    try:
        prefixes = read_input_file(input_file)
    except Exception as exc:
        print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
        sys.exit(1)
    if not prefixes:
        print("No valid S3 prefixes found in the input file. Nothing to do.")
        sys.exit(0)

    # --- Validate AWS profile ----------------------------------------------
    try:
        session = boto3.Session(profile_name=AWS_PROFILE)
        # Resolve credentials now so a bad profile is reported before scanning.
        credentials = session.get_credentials()
        if credentials is None:
            raise RuntimeError(
                f"No credentials found for AWS profile {AWS_PROFILE!r}"
            )
    except botocore.exceptions.ProfileNotFound as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
        sys.exit(1)

    # --- Print active configuration ----------------------------------------
    ext_label = extensions_label(active_extensions)
    print(f"Bucket: {S3_BUCKET}", file=sys.stderr)
    print(f"Extensions: {ext_label}", file=sys.stderr)
    # Text options are only shown when at least one text extension is active.
    if not active_extensions.isdisjoint(TEXT_EXTENSIONS):
        print(
            f"Text opts: delimiter={delimiter!r} encoding={encoding!r} "
            f"quotechar={quotechar!r}",
            file=sys.stderr,
        )

    # --- Explore -----------------------------------------------------------
    results = explore_directories(prefixes, extensions=active_extensions)
    print_results(results, extensions=active_extensions)