foxtrot/utils/file_viewer.py
2026-04-20 08:25:27 -05:00

142 lines
4.4 KiB
Python

"""Standalone utility to download a .sas7bdat file from S3 and print a
column-level summary of the first 10 rows.
Configure the four constants below, then run::
python3 file_viewer.py
Python 3.14 compatible.
"""
from __future__ import annotations
import os
import sys
import boto3
import pandas as pd
import pyreadstat
# ---------------------------------------------------------------------------
# Configuration — edit these before running
# ---------------------------------------------------------------------------
# Bucket handed to _ensure_local_copy() for the size lookup and the download.
S3_BUCKET: str = "my-bucket"
"""S3 bucket name."""
# Only the final path component of this key is used as the local file name.
S3_KEY: str = "path/to/file.sas7bdat"
"""Object key (path) within the bucket to the .sas7bdat file."""
# Created on startup (including missing parents) if it does not exist yet.
LOCAL_FOLDER: str = "./downloads"
"""Local directory to download the file into."""
# Passed as ``profile_name`` when the boto3 session is created.
AWS_PROFILE: str = "default"
"""AWS CLI profile name used for authentication."""
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _ensure_local_copy(bucket: str, key: str, local_path: str) -> None:
    """Make sure *local_path* holds a copy of ``s3://bucket/key``.

    The object's ``ContentLength`` is looked up first and compared with the
    size of any file already at *local_path*.  Equal sizes mean the existing
    file is kept and a message is printed; a missing file or a size mismatch
    triggers a (re-)download.  The S3 client is built from the module-level
    ``AWS_PROFILE``.
    """
    client = boto3.Session(profile_name=AWS_PROFILE).client("s3")
    expected_bytes = client.head_object(Bucket=bucket, Key=key)["ContentLength"]

    if os.path.exists(local_path):
        actual_bytes = os.path.getsize(local_path)
        if actual_bytes != expected_bytes:
            print(
                f"Local file {local_path} size ({actual_bytes} bytes) differs from "
                f"S3 ({expected_bytes} bytes); re-downloading."
            )
        else:
            # Size is the only freshness check performed; contents are not compared.
            print(
                f"Local file {local_path} already matches s3://{bucket}/{key} "
                f"({actual_bytes} bytes); skipping download."
            )
            return

    print(f"Downloading s3://{bucket}/{key} -> {local_path}")
    client.download_file(bucket, key, local_path)
    print("Download complete.")
def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
    """Load only the leading *row_count* rows of the .sas7bdat file at *path*.

    pyreadstat returns the data frame together with a metadata object; the
    metadata is discarded and just the frame is handed back.
    """
    head_frame, _metadata = pyreadstat.read_sas7bdat(
        path,
        row_limit=row_count,
        row_offset=0,
    )
    return head_frame
def _sample_values(series: pd.Series, n: int = 3) -> str:
"""Return up to *n* non-null sample values as a comma-separated string."""
non_null = series.dropna()
samples = non_null.head(n).tolist()
if not samples:
return "(all null)"
return ", ".join(repr(v) for v in samples)
def _print_summary(df: pd.DataFrame) -> None:
"""Print a nicely formatted summary table to stdout."""
# Pre-compute column data
rows = []
for col in df.columns:
rows.append((col, str(df[col].dtype), _sample_values(df[col], 3)))
# Determine column widths
hdr_name = "Column Name"
hdr_dtype = "Data Type"
hdr_samples = "Sample Values (up to 3)"
w_name = max(len(hdr_name), *(len(r[0]) for r in rows))
w_dtype = max(len(hdr_dtype), *(len(r[1]) for r in rows))
w_samples = max(len(hdr_samples), *(len(r[2]) for r in rows))
fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}"
sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}"
print()
print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)")
print(sep)
print(fmt.format(hdr_name, hdr_dtype, hdr_samples))
print(sep)
for name, dtype, samples in rows:
print(fmt.format(name, dtype, samples))
print(sep)
print()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """Download the configured S3 object if needed and print its summary.

    Uses the module-level ``S3_BUCKET``, ``S3_KEY`` and ``LOCAL_FOLDER``
    settings.  The local file name is the final path component of ``S3_KEY``.

    Returns the process exit status: 0 on success, 1 if the S3 step failed,
    2 if the downloaded file could not be read.
    """
    # --- Download -----------------------------------------------------------
    os.makedirs(LOCAL_FOLDER, exist_ok=True)
    local_path = os.path.join(LOCAL_FOLDER, os.path.basename(S3_KEY))
    try:
        _ensure_local_copy(S3_BUCKET, S3_KEY, local_path)
    except Exception as exc:  # top-level boundary: report and exit non-zero
        print(f"S3 download error: {exc}", file=sys.stderr)
        return 1

    # --- Read & summarize ---------------------------------------------------
    try:
        df = _read_sas_head(local_path, row_count=10)
    except Exception as exc:  # top-level boundary: report and exit non-zero
        print(f"File read error: {exc}", file=sys.stderr)
        return 2

    _print_summary(df)
    return 0


if __name__ == "__main__":
    sys.exit(main())