# foxtrot/utils/file_viewer.py

"""Standalone utility to download a .sas7bdat file from S3 and print a
column-level summary of the first 10 rows.

Configure the four constants below, then run::

    python3 file_viewer.py

Python 3.9 compatible.
"""

from __future__ import annotations

import os
import sys

import boto3
import pandas as pd
import pyreadstat


# ---------------------------------------------------------------------------
# Configuration — edit these before running
# ---------------------------------------------------------------------------

# Passed as the bucket argument of the S3 download.
S3_BUCKET: str = "my-bucket"
"""S3 bucket name."""

# Only the final path component of this key is used as the local file name.
S3_KEY: str = "path/to/file.sas7bdat"
"""Object key (path) within the bucket to the .sas7bdat file."""

# Created by the script entry point if it does not already exist.
LOCAL_FOLDER: str = "./downloads"
"""Local directory to download the file into."""

# Handed to boto3.Session(profile_name=...) when the S3 client is built.
AWS_PROFILE: str = "default"
"""AWS CLI profile name used for authentication."""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _download_from_s3(bucket: str, key: str, local_path: str) -> None:
    """Fetch the object *key* in *bucket* and store it at *local_path*.

    The S3 client is created from a boto3 session bound to the module-level
    ``AWS_PROFILE``. A progress line is printed to stdout before the transfer
    and another once it has finished.

    Args:
        bucket: Name of the S3 bucket holding the object.
        key: Object key within *bucket*.
        local_path: Destination file path on the local filesystem.
    """
    client = boto3.Session(profile_name=AWS_PROFILE).client("s3")
    print(f"Downloading s3://{bucket}/{key} -> {local_path}")
    client.download_file(bucket, key, local_path)
    print("Download complete.")


def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
    """Load the leading rows of the .sas7bdat file at *path*.

    Reading starts at the first row and is capped at *row_count* rows. The
    metadata object that pyreadstat returns next to the frame is discarded.

    NOTE(review): pyreadstat appears to treat ``row_limit=0`` as "no limit",
    so ``row_count=0`` may read the whole file -- confirm against its docs.

    Args:
        path: Location of the .sas7bdat file on disk.
        row_count: Maximum number of rows to read.

    Returns:
        The data frame produced by ``pyreadstat.read_sas7bdat``.
    """
    frame, _metadata = pyreadstat.read_sas7bdat(
        path,
        row_offset=0,
        row_limit=row_count,
    )
    return frame


def _sample_values(series: pd.Series, n: int = 3) -> str:
    """Format the first *n* non-null entries of *series* for display.

    Args:
        series: Column to sample from.
        n: Maximum number of values to include.

    Returns:
        The ``repr`` of each sampled value joined by ``", "``, or the
        placeholder ``"(all null)"`` when no non-null value is available.
    """
    picked = series.dropna().head(n).tolist()
    return ", ".join(map(repr, picked)) if picked else "(all null)"


def _print_summary(df: pd.DataFrame) -> None:
    """Print a column-by-column summary table of *df* to stdout.

    One line is printed per column, showing its name, its dtype and up to
    three non-null sample values. Each table column is padded to the width
    of its widest cell, header included.

    A frame without any columns prints only the title and the table header
    instead of raising, and column labels are converted with ``str`` so that
    non-string labels can be measured and aligned.

    Args:
        df: The frame to summarize; its row and column counts appear in the
            title line.
    """
    # Render every cell as text up front so widths can be measured.
    rows = [
        (str(col), str(df[col].dtype), _sample_values(df[col], 3))
        for col in df.columns
    ]

    hdr_name = "Column Name"
    hdr_dtype = "Data Type"
    hdr_samples = "Sample Values (up to 3)"

    # max() receives a list that always contains the header length.  The
    # previous ``max(header_len, *cells)`` form degenerated to ``max(int)``
    # when there were no columns, which raises TypeError.
    w_name = max([len(hdr_name), *(len(r[0]) for r in rows)])
    w_dtype = max([len(hdr_dtype), *(len(r[1]) for r in rows)])
    w_samples = max([len(hdr_samples), *(len(r[2]) for r in rows)])

    # Doubled braces survive the f-string and become str.format() fields.
    fmt = f"  {{:<{w_name}}}  {{:<{w_dtype}}}  {{:<{w_samples}}}"
    sep = f"  {'-' * w_name}  {'-' * w_dtype}  {'-' * w_samples}"

    print()
    print(f"  Summary of first {len(df)} row(s)  ({len(df.columns)} columns)")
    print(sep)
    print(fmt.format(hdr_name, hdr_dtype, hdr_samples))
    print(sep)
    for name, dtype, samples in rows:
        print(fmt.format(name, dtype, samples))
    print(sep)
    print()


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


if __name__ == "__main__":
    # Step 1: fetch the object into LOCAL_FOLDER, keeping the key's file name.
    os.makedirs(LOCAL_FOLDER, exist_ok=True)
    target_path = os.path.join(LOCAL_FOLDER, os.path.basename(S3_KEY))

    try:
        _download_from_s3(S3_BUCKET, S3_KEY, target_path)
    except Exception as download_err:
        # Any download failure ends the run with exit status 1.
        print(f"S3 download error: {download_err}", file=sys.stderr)
        sys.exit(1)

    # Step 2: load the first 10 rows and print the per-column summary.
    try:
        head_frame = _read_sas_head(target_path, row_count=10)
    except Exception as read_err:
        # Any read/parse failure ends the run with exit status 2.
        print(f"File read error: {read_err}", file=sys.stderr)
        sys.exit(2)

    _print_summary(head_frame)