2026-04-20 21:27:54 +00:00
|
|
|
"""Explore S3 directories and categorise them by accessibility.
|
|
|
|
|
|
|
|
|
|
Reads a text file containing one S3 prefix per line (paths within the bucket
|
2026-04-22 01:05:26 +00:00
|
|
|
configured by the ``S3_BUCKET`` constant or ``--bucket`` CLI argument), then
|
|
|
|
|
for each prefix:
|
|
|
|
|
|
2026-04-20 21:27:54 +00:00
|
|
|
- Lists all objects recursively (via ``list_objects_v2`` paginator)
|
2026-04-22 01:05:26 +00:00
|
|
|
- **Only considers files matching the configured extensions** (default: all
|
|
|
|
|
supported extensions — SAS and text). All other file types are ignored.
|
2026-04-20 21:30:35 +00:00
|
|
|
- Tests read permission with ``head_object`` on the first matching file found
|
2026-04-20 22:02:35 +00:00
|
|
|
- If the first file is accessible, tests ALL remaining files individually
|
|
|
|
|
- Categorises the directory as **Available**, **Blocked**, or **Empty**, and
  tracks individual file **Exceptions** within available directories
|
2026-04-20 21:27:54 +00:00
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
Supported file types
|
|
|
|
|
--------------------
|
|
|
|
|
* **SAS files**: ``.sas7bdat``, ``.xpt``, ``.xport``
|
|
|
|
|
* **Text / delimited files**: ``.txt``, ``.csv``, ``.tsv``
|
|
|
|
|
|
2026-04-20 21:30:35 +00:00
|
|
|
A directory is considered *empty* if it contains no files matching the
|
|
|
|
|
extension filter, even when other file types are present.
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
Configure the constants below (or use CLI arguments), then run::
|
2026-04-20 21:27:54 +00:00
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
python3 data_explorer.py [OPTIONS]
|
2026-04-20 21:27:54 +00:00
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
Python 3.10+ compatible. Requires ``boto3`` / ``botocore`` and stdlib.
|
2026-04-20 21:27:54 +00:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
import argparse
|
|
|
|
|
import os
|
2026-04-20 21:27:54 +00:00
|
|
|
import sys
|
|
|
|
|
from dataclasses import dataclass, field
|
2026-04-22 01:05:26 +00:00
|
|
|
from typing import List, Set, Tuple
|
2026-04-20 21:27:54 +00:00
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Dependency check
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# Fail fast with an actionable message when the AWS SDK is missing, instead
# of letting a raw ImportError traceback reach the user.
try:
    import boto3  # noqa: F401
    import botocore.exceptions  # noqa: F401
except ImportError:
    print(
        "ERROR: boto3 / botocore is not installed.\nInstall with: pip install boto3",
        file=sys.stderr,
    )
    raise SystemExit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
2026-04-22 01:05:26 +00:00
|
|
|
# Extension constants
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# All extensions are written lower-case with a leading dot; the matching
# helpers in this file lower-case the S3 key before comparing suffixes.
SAS_EXTENSIONS: Set[str] = {".sas7bdat", ".xpt", ".xport"}
"""File extensions recognised as SAS data files."""

TEXT_EXTENSIONS: Set[str] = {".txt", ".csv", ".tsv"}
"""File extensions recognised as delimited text / CSV files."""

# Built with ``|`` so this is a new set, independent of the two sets above.
SUPPORTED_EXTENSIONS: Set[str] = SAS_EXTENSIONS | TEXT_EXTENSIONS
"""Union of all file extensions this tool can work with."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Configuration defaults — edit these or override via CLI arguments
|
2026-04-20 21:27:54 +00:00
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
# NOTE: this binds the *same* set object as SUPPORTED_EXTENSIONS (no copy is
# made). The script entry point rebinds FILE_EXTENSIONS, S3_BUCKET and
# AWS_PROFILE from the parsed CLI arguments before exploring.
FILE_EXTENSIONS: Set[str] = SUPPORTED_EXTENSIONS
"""Set of extensions to filter on (case-insensitive). Defaults to all supported."""

INPUT_FILE: str = "s3_directories.txt"
"""Path to the text file containing one S3 prefix per line."""

S3_BUCKET: str = "my-bucket"
"""S3 bucket name (all prefixes are assumed to live in this bucket)."""

AWS_PROFILE: str = "default"
"""AWS CLI profile name used for authentication."""

# Text-file reading defaults (used when downloading / previewing text files).
# Within this file they only feed the CLI defaults and the configuration
# banner printed to stderr; the S3 scan itself never reads file contents.
DEFAULT_DELIMITER: str = ","
DEFAULT_ENCODING: str = "utf-8"
DEFAULT_QUOTECHAR: str = '"'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Auto-detection helpers
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_file_type(filename: str) -> str:
    """Classify *filename* as ``'sas'``, ``'text'`` or ``'unknown'``.

    Only the final extension (as split off by ``os.path.splitext``) is
    examined, and the comparison ignores case. For ``.tsv`` files the caller
    should default the delimiter to a tab character (``'\\t'``).

    Examples
    --------
    >>> detect_file_type("data.sas7bdat")
    'sas'
    >>> detect_file_type("report.CSV")
    'text'
    >>> detect_file_type("archive.zip")
    'unknown'
    """
    suffix = os.path.splitext(filename)[1].lower()
    # SAS is tested first, mirroring the documented order of the categories.
    for category, known_suffixes in (
        ("sas", SAS_EXTENSIONS),
        ("text", TEXT_EXTENSIONS),
    ):
        if suffix in known_suffixes:
            return category
    return "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def default_delimiter_for(filename: str) -> str:
    """Pick the default field delimiter implied by *filename*'s extension.

    * ``.tsv`` (any letter case) → ``'\\t'``
    * everything else → ``','``
    """
    _, suffix = os.path.splitext(filename)
    return "\t" if suffix.lower() == ".tsv" else ","
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def matches_extensions(key: str, extensions: Set[str]) -> bool:
    """Return ``True`` if *key* ends with any extension in *extensions*.

    The comparison is case-insensitive on **both** sides: the key and every
    extension are lower-cased before matching, so ``{".CSV"}`` matches
    ``"report.csv"`` and vice versa.

    Parameters
    ----------
    key:
        Object key (or file name) to test.
    extensions:
        Suffixes to look for, e.g. ``{".csv", ".tsv"}``. An empty collection
        never matches.
    """
    # str.endswith accepts a tuple of candidates, so one call covers them all.
    suffixes = tuple(ext.lower() for ext in extensions)
    return key.lower().endswith(suffixes)
|
|
|
|
|
|
2026-04-20 21:27:54 +00:00
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Data structures
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class AvailableDir:
    """An S3 directory that is readable.

    Created by ``explore_directories`` once the first matching file under the
    prefix passed the ``head_object`` check.
    """

    prefix: str
    file_count: int  # number of files matching the extension filter
    total_size: int  # bytes, summed over all matching files
    accessible_count: int = 0  # files that passed head_object
    total_count: int = 0  # total matching files found (set to the same value as file_count)
    accessible_size: int = 0  # total size of accessible files only
|
2026-04-20 21:27:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class BlockedDir:
    """An S3 directory where access was denied or an error occurred."""

    prefix: str
    file_count: int  # matching files found; 0 when the listing itself failed
    error: str  # error text from the failed listing or head_object call
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class EmptyDir:
    """An S3 directory with no objects matching the extension filter.

    Other file types may still exist under the prefix; they are ignored by
    the listing step.
    """

    prefix: str
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 22:02:35 +00:00
|
|
|
@dataclass
class ExceptionFile:
    """A specific file that failed permission check within an otherwise available directory."""

    prefix: str  # the directory prefix
    key: str  # the full S3 key of the failed file
    error: str  # the error message returned by the head_object check
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 21:27:54 +00:00
|
|
|
@dataclass
class Results:
    """Aggregated exploration results.

    Each list starts empty (``default_factory=list``), so instances never
    share list objects.
    """

    available: List[AvailableDir] = field(default_factory=list)  # readable prefixes
    blocked: List[BlockedDir] = field(default_factory=list)  # listing or first head_object failed
    empty: List[EmptyDir] = field(default_factory=list)  # no matching files
    exceptions: List[ExceptionFile] = field(default_factory=list)  # unreadable files inside available prefixes
|
2026-04-20 21:27:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Helpers
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_input_file(path: str) -> List[str]:
    """Return a list of S3 prefixes from *path*, ignoring blanks and comments.

    Each line is stripped of surrounding whitespace; empty lines and lines
    starting with ``#`` are skipped. Leading/trailing slashes are removed and
    a single trailing ``/`` is re-added, so non-empty prefixes always end
    with ``/``. A line consisting only of slashes yields the empty (root)
    prefix ``""``.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is discarded so that it cannot become part of the first prefix or hide a
    first-line ``#`` comment.

    Parameters
    ----------
    path:
        Path to the text file containing one prefix per line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    prefixes: List[str] = []
    # "utf-8-sig" behaves like "utf-8" but drops a leading BOM (commonly
    # written by Windows editors); files without a BOM are read unchanged.
    with open(path, encoding="utf-8-sig") as fh:
        for raw_line in fh:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Normalise: strip surrounding slashes, then re-add a single
            # trailing slash (unless the prefix is empty/root).
            line = line.strip("/")
            if line:
                line += "/"
            prefixes.append(line)
    return prefixes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_size(size_bytes: int) -> str:
    """Render *size_bytes* as a human-readable string.

    Values below 1024 are shown as whole bytes (``"512 B"``). Larger values
    are scaled by powers of 1024 and shown with one decimal place and
    thousands separators in KB, MB, GB or TB; TB is the largest unit, so very
    large values stay in TB (e.g. ``"1,024.0 TB"``).
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    scaled = size_bytes / 1024.0
    for unit in ("KB", "MB", "GB"):
        if scaled < 1024.0:
            return f"{scaled:,.1f} {unit}"
        scaled /= 1024.0
    # Anything that did not fit in GB is reported in TB, however large.
    return f"{scaled:,.1f} TB"
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
def extensions_label(extensions: Set[str]) -> str:
    """Join *extensions* into a compact, alphabetically sorted label.

    For example ``{".tsv", ".csv", ".txt"}`` becomes ``.csv/.tsv/.txt``; an
    empty set yields an empty string.
    """
    ordered = sorted(extensions)
    return "/".join(ordered)
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 21:27:54 +00:00
|
|
|
def list_objects(
    s3_client: "botocore.client.S3",
    bucket: str,
    prefix: str,
    extensions: Set[str] | None = None,
) -> Tuple[List[Tuple[str, int]], int]:
    """List every object under *prefix* whose key matches the extension filter.

    All result pages of the ``list_objects_v2`` paginator are consumed. An
    object is kept only when its lower-cased key ends with one of the
    (lower-cased) *extensions*; everything else is silently skipped. When
    *extensions* is ``None`` the module-level ``FILE_EXTENSIONS`` set is used.

    Returns ``(files, total_size)`` where *files* is a list of ``(key, size)``
    tuples in listing order and *total_size* is the sum of those sizes in
    bytes.
    """
    wanted = FILE_EXTENSIONS if extensions is None else extensions
    suffixes = tuple(ext.lower() for ext in wanted)

    pages = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )
    # Pages without any object simply have no "Contents" entry.
    files: List[Tuple[str, int]] = [
        (obj["Key"], obj["Size"])
        for page in pages
        for obj in page.get("Contents", [])
        if obj["Key"].lower().endswith(suffixes)
    ]
    total_size: int = sum(size for _, size in files)
    return files, total_size
|
2026-04-20 21:27:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_read_permission(
    s3_client: "botocore.client.S3",
    bucket: str,
    key: str,
) -> str | None:
    """Probe *key* with ``head_object`` and report whether it succeeded.

    Returns ``None`` when the call succeeds. When the call raises a
    ``ClientError`` the result is ``"<message> (<code>)"`` built from the
    error response, falling back to ``str(exc)`` and ``"Unknown"`` for
    missing fields. Any other exception type propagates to the caller.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as exc:
        details = exc.response.get("Error", {})
        code = details.get("Code", "Unknown")
        message = details.get("Message", str(exc))
        return f"{message} ({code})"
    else:
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Core logic
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
def explore_directories(
    prefixes: List[str],
    *,
    extensions: Set[str] | None = None,
) -> Results:
    """Explore every prefix in ``S3_BUCKET`` and return categorised *Results*.

    For each prefix the matching objects are listed, one of them is probed
    with ``head_object`` and, if that succeeds, every other matching object
    is probed as well. Progress messages are written to stderr. The S3 client
    is created from a ``boto3`` session using the module-level
    ``AWS_PROFILE``.

    Classification per prefix:

    * listing raised an error → **blocked** (``file_count`` is 0)
    * no matching files → **empty**
    * probe of the first file failed → **blocked**
    * otherwise → **available**; files that individually fail the probe are
      recorded in ``Results.exceptions``

    Parameters
    ----------
    prefixes:
        List of S3 key prefixes to explore.
    extensions:
        Set of file extensions to filter on. Defaults to the module-level
        ``FILE_EXTENSIONS`` (which itself defaults to ``SUPPORTED_EXTENSIONS``).
    """
    active_exts = FILE_EXTENSIONS if extensions is None else extensions
    lowered_exts = {ext.lower() for ext in active_exts}
    label = extensions_label(active_exts)

    client = boto3.Session(profile_name=AWS_PROFILE).client("s3")

    outcome = Results()
    prefix_total = len(prefixes)

    for position, prefix in enumerate(prefixes, start=1):
        print(
            f"[{position}/{prefix_total}] Checking {prefix} (filtering for {label}) ...",
            file=sys.stderr,
        )

        # --- Recursive listing ------------------------------------------------
        # Any failure while listing marks the whole prefix as blocked.
        try:
            files, total_size = list_objects(
                client, S3_BUCKET, prefix, extensions=active_exts
            )
        except botocore.exceptions.ClientError as exc:
            details = exc.response.get("Error", {})
            code = details.get("Code", "Unknown")
            message = details.get("Message", str(exc))
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=f"{message} ({code})")
            )
            continue
        except Exception as exc:
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=str(exc))
            )
            continue

        if not files:
            outcome.empty.append(EmptyDir(prefix=prefix))
            continue

        file_count = len(files)

        # --- Permission check on first file -----------------------------------
        # Default to the first listed key. If that key looks like a directory
        # marker (ends in "/") and the listing holds real data, switch to the
        # first key that is not a zero-byte marker and matches the filter.
        probe_key = files[0][0]
        if probe_key.endswith("/") and total_size > 0:
            probe_key = next(
                (
                    key
                    for key, size in files
                    if (not key.endswith("/") or size != 0)
                    and matches_extensions(key, lowered_exts)
                ),
                probe_key,
            )

        probe_error = check_read_permission(client, S3_BUCKET, probe_key)
        if probe_error is not None:
            # First file blocked → entire directory is blocked
            outcome.blocked.append(
                BlockedDir(prefix=prefix, file_count=file_count, error=probe_error)
            )
            continue

        # --- First file accessible → check ALL remaining files ----------------
        readable_count = 1  # the probed key already passed
        readable_size = next(
            (size for key, size in files if key == probe_key), 0
        )
        pending = [entry for entry in files if entry[0] != probe_key]
        flagged: List[ExceptionFile] = []

        if not pending:
            # Only one file and it passed
            readable_size = total_size
        else:
            if len(pending) > 10:
                print(
                    f" Verifying access to {file_count} {label} files in {prefix} ...",
                    file=sys.stderr,
                )
            for key, size in pending:
                file_error = check_read_permission(client, S3_BUCKET, key)
                if file_error is not None:
                    flagged.append(
                        ExceptionFile(prefix=prefix, key=key, error=file_error)
                    )
                    continue
                readable_count += 1
                readable_size += size

        outcome.available.append(
            AvailableDir(
                prefix=prefix,
                file_count=file_count,
                total_size=total_size,
                accessible_count=readable_count,
                total_count=file_count,
                accessible_size=readable_size,
            )
        )
        outcome.exceptions.extend(flagged)

    return outcome
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Output
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
def print_results(results: Results, *, extensions: Set[str] | None = None) -> None:
    """Write a human-readable summary of *results* to stdout.

    The report has a header (bucket and extension filter) followed by four
    sections in fixed order: Available, Blocked, Exceptions and Empty. A
    section without entries prints ``(none)``.

    Parameters
    ----------
    results:
        The exploration results to display.
    extensions:
        The set of extensions that were used for filtering. Used only for
        labelling in the output. Defaults to ``FILE_EXTENSIONS``.
    """
    label = extensions_label(FILE_EXTENSIONS if extensions is None else extensions)

    print()
    print("=== S3 Directory Explorer Results ===")
    print(f"Bucket: {S3_BUCKET}")
    print(f"Extensions: {label}")

    # --- Available ---
    print()
    print(f"--- Available ({len(results.available)}) ---")
    for entry in results.available:
        print(f" {entry.prefix}")
        print(
            f" Matching files ({label}): {entry.accessible_count}/{entry.total_count} accessible"
            f" | Total Size: {format_size(entry.accessible_size)}"
        )
    if not results.available:
        print(" (none)")

    # --- Blocked ---
    print()
    print(f"--- Blocked ({len(results.blocked)}) ---")
    for entry in results.blocked:
        print(f" {entry.prefix}")
        # A zero file count means the listing itself failed, so only the
        # error is worth showing.
        if entry.file_count:
            print(f" Matching files ({label}) found: {entry.file_count} | Error: {entry.error}")
        else:
            print(f" Error: {entry.error}")
    if not results.blocked:
        print(" (none)")

    # --- Exceptions ---
    print()
    print(f"--- Exceptions ({len(results.exceptions)}) ---")
    for failed in results.exceptions:
        print(f" {failed.key}")
        print(f" Directory: {failed.prefix} | Error: {failed.error}")
    if not results.exceptions:
        print(" (none)")

    # --- Empty ---
    print()
    print(f"--- Empty / no matching files ({len(results.empty)}) ---")
    for entry in results.empty:
        print(f" {entry.prefix}")
    if not results.empty:
        print(" (none)")

    print()
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# CLI argument parsing
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the explorer script.

    Three option groups are defined: file-type selection (``--file-type``,
    ``--extensions``), text-file parameters (``--delimiter``, ``--encoding``,
    ``--quotechar``) and S3 settings (``--bucket``, ``--profile``,
    ``--input-file``). The S3 options and ``--delimiter`` default to ``None``
    so the caller can tell "not given" apart from an explicit value.
    """
    description = (
        "Explore S3 directories and categorise them by accessibility. "
        "Supports SAS files (.sas7bdat, .xpt, .xport) and delimited text "
        "files (.txt, .csv, .tsv)."
    )
    parser = argparse.ArgumentParser(description=description)

    # --- File-type / extension selection ---
    file_type_help = (
        "Restrict the scan to a specific file type. "
        "'sas' = .sas7bdat/.xpt/.xport only; "
        "'text' = .txt/.csv/.tsv only; "
        "'all' = both (default)."
    )
    extensions_help = (
        "Explicit list of extensions to filter on (e.g. --extensions .csv .tsv). "
        "Overrides --file-type when provided."
    )
    selection = parser.add_argument_group("File-type selection")
    selection.add_argument(
        "--file-type",
        choices=["sas", "text", "all"],
        default="all",
        help=file_type_help,
    )
    selection.add_argument(
        "--extensions",
        nargs="+",
        metavar="EXT",
        help=extensions_help,
    )

    # --- Text-file reading parameters ---
    text_description = (
        "Parameters used when reading delimited text files. These are "
        "stored for downstream consumers and do not affect the S3 scan "
        "itself."
    )
    delimiter_help = (
        "Field delimiter for text files (default: ',' for .csv/.txt, "
        "'\\t' for .tsv). Use 'tab' or '\\t' for a tab character."
    )
    text_options = parser.add_argument_group(
        "Text-file parameters",
        description=text_description,
    )
    text_options.add_argument("--delimiter", default=None, help=delimiter_help)
    text_options.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help=f"Character encoding for text files (default: {DEFAULT_ENCODING}).",
    )
    text_options.add_argument(
        "--quotechar",
        default=DEFAULT_QUOTECHAR,
        help=f"Quote character for text files (default: {DEFAULT_QUOTECHAR!r}).",
    )

    # --- S3 / general settings ---
    s3_options = parser.add_argument_group("S3 settings")
    s3_options.add_argument(
        "--bucket",
        default=None,
        help=f"S3 bucket name (default: {S3_BUCKET}).",
    )
    s3_options.add_argument(
        "--profile",
        default=None,
        help=f"AWS CLI profile name (default: {AWS_PROFILE}).",
    )
    s3_options.add_argument(
        "--input-file",
        default=None,
        help=f"Path to the text file with S3 prefixes (default: {INPUT_FILE}).",
    )

    return parser
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resolve_extensions(args: argparse.Namespace) -> Set[str]:
    """Determine the active extension set from parsed CLI *args*.

    If ``--extensions`` is provided it takes precedence: each value is
    stripped, lower-cased and given a leading dot if it lacks one. Values
    that are blank or consist of a lone ``"."`` are ignored, because they
    would otherwise turn into the filter ``"."`` and match any key that
    merely ends with a dot. If nothing usable remains, or ``--extensions``
    was not given, ``--file-type`` selects one of the predefined sets.

    Parameters
    ----------
    args:
        Namespace with ``extensions`` (list of strings or ``None``) and
        ``file_type`` (``"sas"``, ``"text"`` or ``"all"``).

    Returns
    -------
    The set of lower-case, dot-prefixed extensions to filter on. For the
    ``--file-type`` path this is the module-level constant itself, not a copy.
    """
    if args.extensions:
        exts: Set[str] = set()
        for raw in args.extensions:
            ext = raw.strip().lower()
            if not ext or ext == ".":
                # Nothing meaningful to filter on (e.g. an empty shell variable).
                continue
            if not ext.startswith("."):
                ext = "." + ext
            exts.add(ext)
        if exts:
            return exts

    if args.file_type == "sas":
        return SAS_EXTENSIONS
    if args.file_type == "text":
        return TEXT_EXTENSIONS
    return SUPPORTED_EXTENSIONS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resolve_delimiter(args: argparse.Namespace) -> str:
    """Work out the field delimiter requested on the command line.

    When ``--delimiter`` was not given, ``DEFAULT_DELIMITER`` is returned.
    The spellings ``'tab'`` (any letter case) and the two-character sequence
    ``'\\t'`` are translated to a real tab so users can request one without
    shell-escaping issues; any other value is returned unchanged.
    """
    requested = args.delimiter
    if requested is None:
        return DEFAULT_DELIMITER
    return "\t" if requested.lower() in ("tab", "\\t") else requested
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 21:27:54 +00:00
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Main
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    args = build_arg_parser().parse_args()

    # --- Apply CLI overrides to module-level config ---------------------------
    # The functions above read these module-level names when they are called,
    # so rebinding them here changes what the exploration uses.
    if args.bucket:
        S3_BUCKET = args.bucket
    if args.profile:
        AWS_PROFILE = args.profile
    input_file = args.input_file or INPUT_FILE

    active_extensions = resolve_extensions(args)
    FILE_EXTENSIONS = active_extensions

    # Text-file options are only echoed below; the scan never reads contents.
    delimiter = resolve_delimiter(args)
    encoding = args.encoding
    quotechar = args.quotechar

    # --- Read input file ------------------------------------------------------
    if not os.path.exists(input_file):
        print(f"ERROR: Input file not found: {input_file}", file=sys.stderr)
        sys.exit(1)

    try:
        prefixes = read_input_file(input_file)
    except Exception as exc:
        print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
        sys.exit(1)

    if not prefixes:
        print("No valid S3 prefixes found in the input file. Nothing to do.")
        sys.exit(0)

    # --- Validate AWS profile -------------------------------------------------
    try:
        # Resolving credentials up front surfaces a bad profile before any
        # S3 call is attempted.
        if boto3.Session(profile_name=AWS_PROFILE).get_credentials() is None:
            raise RuntimeError(
                f"No credentials found for AWS profile {AWS_PROFILE!r}"
            )
    except botocore.exceptions.ProfileNotFound as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
        sys.exit(1)

    # --- Print active configuration -------------------------------------------
    ext_label = extensions_label(active_extensions)
    print(f"Bucket: {S3_BUCKET}", file=sys.stderr)
    print(f"Extensions: {ext_label}", file=sys.stderr)
    if active_extensions & TEXT_EXTENSIONS:
        print(
            f"Text opts: delimiter={delimiter!r} encoding={encoding!r} "
            f"quotechar={quotechar!r}",
            file=sys.stderr,
        )

    # --- Explore --------------------------------------------------------------
    results = explore_directories(prefixes, extensions=active_extensions)
    print_results(results, extensions=active_extensions)
|