# foxtrot/utils/data_explorer.py

"""Explore S3 directories and categorise them by accessibility.

Reads a text file containing one S3 prefix per line (paths within the bucket
configured by the ``S3_BUCKET`` constant or ``--bucket`` CLI argument), then
for each prefix:

- Lists all objects recursively (via ``list_objects_v2`` paginator)
- **Only considers files matching the configured extensions** (default: all
  supported extensions — SAS and text).  All other file types are ignored.
- Tests read permission with ``head_object`` on the first matching file found
- Categorises the directory as **Available**, **Blocked**, or **Empty**

Supported file types
--------------------
* **SAS files**: ``.sas7bdat``, ``.xpt``, ``.xport``
* **Text / delimited files**: ``.txt``, ``.csv``, ``.tsv``

A directory is considered *empty* if it contains no files matching the
extension filter, even when other file types are present.

Configure the constants below (or use CLI arguments), then run::

    python3 data_explorer.py [OPTIONS]

Python 3.10+ compatible.  Requires ``boto3`` / ``botocore`` and stdlib.
"""

from __future__ import annotations

import argparse
import os
import sys
from dataclasses import dataclass, field
from typing import List, Set, Tuple

# ---------------------------------------------------------------------------
# Dependency check
# ---------------------------------------------------------------------------

# Fail fast with an actionable message (instead of a raw ImportError
# traceback) when the AWS SDK is missing.  NOTE: this runs at import time, so
# importing this module without boto3 installed terminates the interpreter
# with exit status 1.
try:
    import boto3  # noqa: F401
    import botocore.exceptions  # noqa: F401
except ImportError:
    print(
        "ERROR: boto3 / botocore is not installed.\n"
        "Install with:  pip install boto3",
        file=sys.stderr,
    )
    sys.exit(1)


# ---------------------------------------------------------------------------
# Extension constants
# ---------------------------------------------------------------------------

# All entries are written lowercase with a leading dot; the lookup helpers in
# this module lower-case the file name before comparing against these sets.
SAS_EXTENSIONS: Set[str] = {".sas7bdat", ".xpt", ".xport"}
"""File extensions recognised as SAS data files."""

TEXT_EXTENSIONS: Set[str] = {".txt", ".csv", ".tsv"}
"""File extensions recognised as delimited text / CSV files."""

# Built with ``|`` so this is a new set, independent of the two sets above.
SUPPORTED_EXTENSIONS: Set[str] = SAS_EXTENSIONS | TEXT_EXTENSIONS
"""Union of all file extensions this tool can work with."""


# ---------------------------------------------------------------------------
# Configuration defaults — edit these or override via CLI arguments
# ---------------------------------------------------------------------------

# NOTE: this is the *same* set object as SUPPORTED_EXTENSIONS (not a copy), so
# mutating one in place would change the other.  The ``__main__`` block rebinds
# this name to the extension set selected on the command line.
FILE_EXTENSIONS: Set[str] = SUPPORTED_EXTENSIONS
"""Set of extensions to filter on (case-insensitive).  Defaults to all supported."""

INPUT_FILE: str = "s3_directories.txt"
"""Path to the text file containing one S3 prefix per line."""

# Rebound by the ``__main__`` block when ``--bucket`` is given.
S3_BUCKET: str = "my-bucket"
"""S3 bucket name (all prefixes are assumed to live in this bucket)."""

# Rebound by the ``__main__`` block when ``--profile`` is given.
AWS_PROFILE: str = "default"
"""AWS CLI profile name used for authentication."""

# Text-file reading defaults.  They do not influence the S3 scan: within this
# module DEFAULT_DELIMITER is the fallback returned by ``resolve_delimiter``
# and the other two are the argparse defaults for ``--encoding`` and
# ``--quotechar``.
DEFAULT_DELIMITER: str = ","
DEFAULT_ENCODING: str = "utf-8"
DEFAULT_QUOTECHAR: str = '"'


# ---------------------------------------------------------------------------
# Auto-detection helpers
# ---------------------------------------------------------------------------


def detect_file_type(filename: str) -> str:
    """Classify *filename* as ``'sas'``, ``'text'``, or ``'unknown'``.

    Only the final extension (as split off by ``os.path.splitext``) is
    inspected, and it is lower-cased first, so the check is case-insensitive.
    Callers handling ``.tsv`` files should default the delimiter to a tab
    character (``'\\t'``).

    Examples
    --------
    >>> detect_file_type("data.sas7bdat")
    'sas'
    >>> detect_file_type("report.CSV")
    'text'
    >>> detect_file_type("archive.zip")
    'unknown'
    """
    _, suffix = os.path.splitext(filename)
    suffix = suffix.lower()
    # SAS is checked before text, mirroring the order of the labels above.
    for label, known in (("sas", SAS_EXTENSIONS), ("text", TEXT_EXTENSIONS)):
        if suffix in known:
            return label
    return "unknown"


def default_delimiter_for(filename: str) -> str:
    """Pick the default field delimiter implied by *filename*'s extension.

    A ``.tsv`` extension (any letter case) yields a tab character; every other
    name, including one without an extension, yields a comma.
    """
    _, suffix = os.path.splitext(filename)
    return "\t" if suffix.lower() == ".tsv" else ","


def matches_extensions(key: str, extensions: Set[str]) -> bool:
    """Return ``True`` if *key* ends with any extension in *extensions*.

    The comparison is case-insensitive on **both** sides: *key* and every
    entry of *extensions* are lower-cased before matching, so ``{".CSV"}``
    matches ``"report.csv"``.  An empty *extensions* set never matches.

    Parameters
    ----------
    key:
        Object key / file name to test.
    extensions:
        Suffixes to look for (conventionally with a leading dot).
    """
    # str.endswith accepts a tuple of candidates and returns False for an
    # empty tuple, which gives the "no extensions -> no match" behaviour.
    suffixes = tuple(ext.lower() for ext in extensions)
    return key.lower().endswith(suffixes)


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------


@dataclass
class AvailableDir:
    """An S3 directory that is readable.

    Created by ``explore_directories`` when the ``head_object`` permission
    probe on one of the prefix's matching files succeeds.
    """

    prefix: str  # S3 key prefix that was explored
    file_count: int  # number of objects under the prefix matching the extension filter
    total_size: int  # bytes


@dataclass
class BlockedDir:
    """An S3 directory where access was denied or an error occurred."""

    prefix: str  # S3 key prefix that was explored
    # Matching objects counted during listing; ``explore_directories`` stores 0
    # when the listing call itself failed.
    file_count: int
    error: str  # human-readable description of the failure


@dataclass
class EmptyDir:
    """An S3 directory with no objects matching the extension filter.

    The prefix may still contain objects of other file types; only files that
    match the active extensions are counted.
    """

    prefix: str  # S3 key prefix that was explored


@dataclass
class Results:
    """Aggregated exploration results.

    Each explored prefix is appended to exactly one of the three lists by
    ``explore_directories``.
    """

    # ``default_factory`` gives every instance its own fresh lists.
    available: List[AvailableDir] = field(default_factory=list)
    blocked: List[BlockedDir] = field(default_factory=list)
    empty: List[EmptyDir] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def read_input_file(path: str) -> List[str]:
    """Return a list of S3 prefixes from *path*, ignoring blanks and comments.

    Each line is stripped of surrounding whitespace; empty lines and lines
    starting with ``#`` are skipped.  Leading and trailing slashes are then
    removed and a single trailing ``/`` is re-added, so ``/data/raw`` and
    ``data/raw/`` both become ``data/raw/``.  A line consisting only of
    slashes yields the empty prefix ``""`` (the bucket root).

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    discarded.

    Raises
    ------
    OSError
        If *path* cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    prefixes: List[str] = []
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM.
    # With plain "utf-8" the BOM would stay glued to the first line, corrupting
    # the first prefix and hiding a first-line "#" comment marker.
    with open(path, encoding="utf-8-sig") as fh:
        for raw_line in fh:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Normalise: drop surrounding slashes, then re-add exactly one
            # trailing slash unless the prefix is empty (bucket root).
            line = line.strip("/")
            prefixes.append(f"{line}/" if line else line)
    return prefixes


def format_size(size_bytes: int) -> str:
    """Return a human-readable size string (B, KB, MB, GB, TB).

    Values below 1024 are printed as whole bytes.  Larger values are scaled by
    powers of 1024 and printed with one decimal place and thousands
    separators.  TB is the largest unit, so very large values are printed as
    e.g. ``"1,024.0 TB"``.

    The unit is chosen from the value *as it will be displayed*: 1,048,575
    bytes is shown as ``"1.0 MB"`` rather than ``"1,024.0 KB"``.
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"
    # Work on a separate float so the int parameter is never rebound.
    value = size_bytes / 1024.0
    for unit in ("KB", "MB", "GB"):
        # Compare the rounded value: 1023.96 KB would otherwise be formatted
        # as "1,024.0 KB" instead of rolling over to the next unit.
        if round(value, 1) < 1024.0:
            return f"{value:,.1f} {unit}"
        value /= 1024.0
    return f"{value:,.1f} TB"


def extensions_label(extensions: Set[str]) -> str:
    """Join *extensions* into one ``/``-separated label, sorted alphabetically.

    For example ``{".tsv", ".csv", ".txt"}`` becomes ``.csv/.tsv/.txt``; an
    empty set becomes the empty string.
    """
    ordered = sorted(extensions)
    return "/".join(ordered)


def list_objects(
    s3_client: "botocore.client.S3",
    bucket: str,
    prefix: str,
    extensions: Set[str] | None = None,
) -> Tuple[str | None, int, int]:
    """Recursively list all objects under *prefix* using streaming counters.

    Only objects whose key ends with one of *extensions* (case-insensitive) are
    counted.  All other files are silently skipped.  When *extensions* is
    ``None`` the module-level ``FILE_EXTENSIONS`` set is used.

    Returns ``(first_key, file_count, total_size)`` where *first_key* is the
    key of the first matching object found (or ``None`` if no matching files
    exist), *file_count* is the total number of matching objects, and
    *total_size* is the sum of their sizes in bytes.

    Unlike the previous implementation this never accumulates all keys in
    memory, making it safe for prefixes with millions of objects.
    """
    if extensions is None:
        extensions = FILE_EXTENSIONS
    exts_lower = {e.lower() for e in extensions}
    paginator = s3_client.get_paginator("list_objects_v2")
    first_key: str | None = None
    file_count: int = 0
    total_size: int = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if not any(obj["Key"].lower().endswith(ext) for ext in exts_lower):
                continue
            if first_key is None:
                first_key = obj["Key"]
            file_count += 1
            total_size += obj["Size"]
    return first_key, file_count, total_size


def check_read_permission(
    s3_client: "botocore.client.S3",
    bucket: str,
    key: str,
) -> str | None:
    """Try ``head_object`` on *key*.  Return ``None`` on success or an error string."""
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "Unknown")
        message = exc.response.get("Error", {}).get("Message", str(exc))
        return f"{message} ({code})"
    return None


# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------


def _describe_error(exc: Exception) -> str:
    """Render *exc* as ``"message (code)"`` for S3 API errors, else ``str(exc)``."""
    if isinstance(exc, botocore.exceptions.ClientError):
        error_info = exc.response.get("Error", {})
        code = error_info.get("Code", "Unknown")
        message = error_info.get("Message", str(exc))
        return f"{message} ({code})"
    return str(exc)


def _pick_probe_key(
    s3: "botocore.client.S3",
    bucket: str,
    prefix: str,
    exts_lower: Set[str],
    first_key: str,
) -> str:
    """Return a matching non-marker key among the first 1000 listed objects, else *first_key*."""
    suffixes = tuple(exts_lower)
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix, PaginationConfig={"MaxItems": 1000}
    )
    for page in pages:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            # A zero-byte key ending in "/" is treated as a directory marker.
            is_marker = key.endswith("/") and obj["Size"] == 0
            if not is_marker and key.lower().endswith(suffixes):
                return key
    return first_key


def explore_directories(
    prefixes: List[str],
    *,
    extensions: Set[str] | None = None,
) -> Results:
    """Explore every prefix in ``S3_BUCKET`` and return categorised *Results*.

    For each prefix the matching objects are listed, then read access is
    probed with ``head_object`` on one matching key.  The prefix is recorded
    as *empty* (no matching objects), *available* (probe succeeded) or
    *blocked* (listing or probe failed).  A failure on one prefix never
    aborts the run: it is recorded against that prefix and the loop moves on.

    Progress lines are written to ``stderr``.  The S3 client is created from
    the module-level ``AWS_PROFILE``.

    Parameters
    ----------
    prefixes:
        List of S3 key prefixes to explore.
    extensions:
        Set of file extensions to filter on.  Defaults to the module-level
        ``FILE_EXTENSIONS`` (which itself defaults to ``SUPPORTED_EXTENSIONS``).
    """
    if extensions is None:
        extensions = FILE_EXTENSIONS
    exts_lower = {e.lower() for e in extensions}
    ext_label = extensions_label(extensions)

    session = boto3.Session(profile_name=AWS_PROFILE)
    s3 = session.client("s3")

    results = Results()
    total = len(prefixes)

    for idx, prefix in enumerate(prefixes, start=1):
        print(
            f"[{idx}/{total}] Checking {prefix} (filtering for {ext_label}) ...",
            file=sys.stderr,
        )

        # --- Recursive listing ------------------------------------------------
        # Broad catch is deliberate: this is the per-prefix boundary, and any
        # failure here should be reported for this prefix only.
        try:
            first_key, file_count, total_size = list_objects(
                s3, S3_BUCKET, prefix, extensions=extensions,
            )
        except Exception as exc:
            results.blocked.append(
                BlockedDir(prefix=prefix, file_count=0, error=_describe_error(exc))
            )
            continue

        if first_key is None:
            results.empty.append(EmptyDir(prefix=prefix))
            continue

        # --- Permission check -------------------------------------------------
        # Prefer a real object over a zero-byte directory marker (key ending
        # in "/") for the head_object test.  The re-scan is best-effort: if it
        # fails, say so on stderr and probe first_key instead.
        test_key = first_key
        if first_key.endswith("/") and total_size > 0:
            try:
                test_key = _pick_probe_key(s3, S3_BUCKET, prefix, exts_lower, first_key)
            except Exception as exc:
                print(
                    f"  WARNING: could not re-scan {prefix} for a probe key "
                    f"({_describe_error(exc)}); falling back to {first_key}",
                    file=sys.stderr,
                )

        # Guarded so that an unexpected failure during the probe marks only
        # this prefix as blocked instead of discarding all collected results.
        try:
            error = check_read_permission(s3, S3_BUCKET, test_key)
        except Exception as exc:
            error = _describe_error(exc)

        if error is None:
            results.available.append(
                AvailableDir(prefix=prefix, file_count=file_count, total_size=total_size)
            )
        else:
            results.blocked.append(
                BlockedDir(prefix=prefix, file_count=file_count, error=error)
            )

    return results


# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------


def print_results(results: Results, *, extensions: Set[str] | None = None) -> None:
    """Write the exploration summary to stdout in three sections.

    The report starts with a header (bucket and extension label) followed by
    the *Available*, *Blocked* and *Empty* sections.  Each section shows its
    entry count in the heading and prints ``(none)`` when it has no entries.

    Parameters
    ----------
    results:
        The exploration results to display.
    extensions:
        The extension set that was used for filtering; it only affects the
        labels in the output.  Defaults to ``FILE_EXTENSIONS``.
    """
    label = extensions_label(FILE_EXTENSIONS if extensions is None else extensions)

    print()
    print("=== S3 Directory Explorer Results ===")
    print(f"Bucket: {S3_BUCKET}")
    print(f"Extensions: {label}")

    # Section 1: readable prefixes, with match count and cumulative size.
    print()
    print(f"--- Available ({len(results.available)}) ---")
    for entry in results.available:
        print(f"  {entry.prefix}")
        print(
            f"    Matching files ({label}): {entry.file_count} | "
            f"Total Size: {format_size(entry.total_size)}"
        )
    if not results.available:
        print("  (none)")

    # Section 2: prefixes that failed; the match count is shown only when non-zero.
    print()
    print(f"--- Blocked ({len(results.blocked)}) ---")
    for entry in results.blocked:
        print(f"  {entry.prefix}")
        if entry.file_count:
            print(
                f"    Matching files ({label}) found: {entry.file_count} | "
                f"Error: {entry.error}"
            )
        else:
            print(f"    Error: {entry.error}")
    if not results.blocked:
        print("  (none)")

    # Section 3: prefixes without any matching file.
    print()
    print(f"--- Empty / no matching files ({len(results.empty)}) ---")
    for entry in results.empty:
        print(f"  {entry.prefix}")
    if not results.empty:
        print("  (none)")

    print()


# ---------------------------------------------------------------------------
# CLI argument parsing
# ---------------------------------------------------------------------------


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    The options are organised in three groups: file-type / extension
    selection, text-file reading parameters, and S3 settings (bucket,
    profile, input file).  The S3 settings all default to ``None`` so the
    caller can tell "not given" apart from an explicit value.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Explore S3 directories and categorise them by accessibility.  "
            "Supports SAS files (.sas7bdat, .xpt, .xport) and delimited text "
            "files (.txt, .csv, .tsv)."
        ),
    )

    # Group 1: which files to look for.
    selection = cli.add_argument_group("File-type selection")
    file_type_help = (
        "Restrict the scan to a specific file type.  "
        "'sas' = .sas7bdat/.xpt/.xport only; "
        "'text' = .txt/.csv/.tsv only; "
        "'all' = both (default)."
    )
    selection.add_argument(
        "--file-type", choices=["sas", "text", "all"], default="all", help=file_type_help
    )
    extensions_help = (
        "Explicit list of extensions to filter on (e.g. --extensions .csv .tsv).  "
        "Overrides --file-type when provided."
    )
    selection.add_argument("--extensions", nargs="+", metavar="EXT", help=extensions_help)

    # Group 2: how text files should be read.
    text_opts = cli.add_argument_group(
        "Text-file parameters",
        description=(
            "Parameters used when reading delimited text files.  These are "
            "stored for downstream consumers and do not affect the S3 scan "
            "itself."
        ),
    )
    delimiter_help = (
        "Field delimiter for text files (default: ',' for .csv/.txt, "
        "'\\t' for .tsv).  Use 'tab' or '\\t' for a tab character."
    )
    text_opts.add_argument("--delimiter", default=None, help=delimiter_help)
    text_opts.add_argument(
        "--encoding",
        default=DEFAULT_ENCODING,
        help=f"Character encoding for text files (default: {DEFAULT_ENCODING}).",
    )
    text_opts.add_argument(
        "--quotechar",
        default=DEFAULT_QUOTECHAR,
        help=f"Quote character for text files (default: {DEFAULT_QUOTECHAR!r}).",
    )

    # Group 3: overrides for the module-level S3 / input settings.
    s3_opts = cli.add_argument_group("S3 settings")
    overrides = (
        ("--bucket", f"S3 bucket name (default: {S3_BUCKET})."),
        ("--profile", f"AWS CLI profile name (default: {AWS_PROFILE})."),
        ("--input-file", f"Path to the text file with S3 prefixes (default: {INPUT_FILE})."),
    )
    for flag, help_text in overrides:
        s3_opts.add_argument(flag, default=None, help=help_text)

    return cli


def resolve_extensions(args: argparse.Namespace) -> Set[str]:
    """Determine the active extension set from parsed CLI *args*.

    If ``--extensions`` is provided it takes precedence: each entry is
    stripped, lower-cased and given a leading dot if it lacks one.  Entries
    that are empty after stripping are ignored.  If no usable entry remains
    (or ``--extensions`` was not given), ``--file-type`` selects one of the
    predefined sets.

    Returns
    -------
    set of str
        A new set for explicit extensions, otherwise one of the module-level
        ``SAS_EXTENSIONS`` / ``TEXT_EXTENSIONS`` / ``SUPPORTED_EXTENSIONS``
        objects (not a copy).
    """
    if args.extensions:
        exts: Set[str] = set()
        for raw in args.extensions:
            ext = raw.strip().lower()
            if not ext:
                # A blank entry (e.g. from an unset shell variable) would
                # otherwise become "." and match every key ending in a dot.
                continue
            exts.add(ext if ext.startswith(".") else f".{ext}")
        if exts:
            return exts
        # Only blank entries were supplied: fall back to --file-type below.

    if args.file_type == "sas":
        return SAS_EXTENSIONS
    if args.file_type == "text":
        return TEXT_EXTENSIONS
    return SUPPORTED_EXTENSIONS


def resolve_delimiter(args: argparse.Namespace) -> str:
    """Work out the field delimiter requested on the command line.

    When ``--delimiter`` was not given, ``DEFAULT_DELIMITER`` is returned.
    The word ``tab`` (any letter case) and the two-character sequence
    ``\\t`` are both translated to a real tab character, so users do not have
    to fight shell escaping.  Any other value is returned unchanged.
    """
    chosen = args.delimiter
    if chosen is None:
        return DEFAULT_DELIMITER
    return "\t" if chosen.lower() in ("tab", "\\t") else chosen


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# Script entry point.  The body deliberately stays at module scope: the
# assignments to S3_BUCKET / AWS_PROFILE / FILE_EXTENSIONS below rebind the
# module globals that explore_directories() and print_results() read when
# they are called.
if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()

    # --- Apply CLI overrides to module-level config ---------------------------
    # The S3 options default to None, so a missing (or empty) value keeps the
    # constant defined at the top of the file.
    if args.bucket:
        S3_BUCKET = args.bucket
    if args.profile:
        AWS_PROFILE = args.profile
    input_file = args.input_file if args.input_file else INPUT_FILE

    active_extensions = resolve_extensions(args)
    FILE_EXTENSIONS = active_extensions

    # Text-file options are only echoed in the configuration banner below;
    # nothing in this script reads file contents with them.
    delimiter = resolve_delimiter(args)
    encoding = args.encoding
    quotechar = args.quotechar

    # --- Read input file ------------------------------------------------------
    if not os.path.exists(input_file):
        print(f"ERROR: Input file not found: {input_file}", file=sys.stderr)
        sys.exit(1)

    try:
        prefixes = read_input_file(input_file)
    except Exception as exc:
        print(f"ERROR: Could not read input file: {exc}", file=sys.stderr)
        sys.exit(1)

    # An input file with only blanks/comments is not an error: exit status 0.
    if not prefixes:
        print("No valid S3 prefixes found in the input file. Nothing to do.")
        sys.exit(0)

    # --- Validate AWS profile -------------------------------------------------
    # This session is used for validation only; explore_directories() creates
    # its own session from AWS_PROFILE.
    try:
        session = boto3.Session(profile_name=AWS_PROFILE)
        # Force credential resolution to catch bad profiles early
        credentials = session.get_credentials()
        if credentials is None:
            raise RuntimeError(
                f"No credentials found for AWS profile {AWS_PROFILE!r}"
            )
    except botocore.exceptions.ProfileNotFound as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Also catches the RuntimeError raised above for missing credentials.
        print(f"ERROR: AWS profile validation failed: {exc}", file=sys.stderr)
        sys.exit(1)

    # --- Print active configuration -------------------------------------------
    # The banner goes to stderr so that stdout carries only the final report.
    ext_label = extensions_label(active_extensions)
    print(f"Bucket:     {S3_BUCKET}", file=sys.stderr)
    print(f"Extensions: {ext_label}", file=sys.stderr)
    # Show the text options only when at least one text extension is active.
    if active_extensions & TEXT_EXTENSIONS:
        print(
            f"Text opts:  delimiter={delimiter!r}  encoding={encoding!r}  "
            f"quotechar={quotechar!r}",
            file=sys.stderr,
        )

    # --- Explore --------------------------------------------------------------
    results = explore_directories(prefixes, extensions=active_extensions)
    print_results(results, extensions=active_extensions)