"""Standalone utility to download a .sas7bdat file from S3 and print a column-level summary of the first 10 rows. Configure the four constants below, then run:: python3 file_viewer.py Python 3.9 compatible. """ from __future__ import annotations import os import sys import boto3 import pandas as pd import pyreadstat # --------------------------------------------------------------------------- # Configuration — edit these before running # --------------------------------------------------------------------------- S3_BUCKET: str = "my-bucket" """S3 bucket name.""" S3_KEY: str = "path/to/file.sas7bdat" """Object key (path) within the bucket to the .sas7bdat file.""" LOCAL_FOLDER: str = "./downloads" """Local directory to download the file into.""" AWS_PROFILE: str = "default" """AWS CLI profile name used for authentication.""" # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _download_from_s3(bucket: str, key: str, local_path: str) -> None: """Download *key* from *bucket* to *local_path* using a named session.""" session = boto3.Session(profile_name=AWS_PROFILE) s3 = session.client("s3") print(f"Downloading s3://{bucket}/{key} -> {local_path}") s3.download_file(bucket, key, local_path) print("Download complete.") def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame: """Read the first *row_count* rows of a .sas7bdat file.""" df, _ = pyreadstat.read_sas7bdat(path, row_offset=0, row_limit=row_count) return df def _sample_values(series: pd.Series, n: int = 3) -> str: """Return up to *n* non-null sample values as a comma-separated string.""" non_null = series.dropna() samples = non_null.head(n).tolist() if not samples: return "(all null)" return ", ".join(repr(v) for v in samples) def _print_summary(df: pd.DataFrame) -> None: """Print a nicely formatted summary table to stdout.""" # Pre-compute column data rows = [] for col in df.columns: rows.append((col, str(df[col].dtype), _sample_values(df[col], 3))) # Determine column widths hdr_name = "Column Name" hdr_dtype = "Data Type" hdr_samples = "Sample Values (up to 3)" w_name = max(len(hdr_name), *(len(r[0]) for r in rows)) w_dtype = max(len(hdr_dtype), *(len(r[1]) for r in rows)) w_samples = max(len(hdr_samples), *(len(r[2]) for r in rows)) fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}" sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}" print() print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)") print(sep) print(fmt.format(hdr_name, hdr_dtype, hdr_samples)) print(sep) for name, dtype, samples in rows: print(fmt.format(name, dtype, samples)) print(sep) print() # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- if __name__ == "__main__": # --- Download ----------------------------------------------------------- os.makedirs(LOCAL_FOLDER, exist_ok=True) local_filename = os.path.basename(S3_KEY) local_path = os.path.join(LOCAL_FOLDER, local_filename) try: _download_from_s3(S3_BUCKET, S3_KEY, local_path) except Exception as exc: print(f"S3 download error: {exc}", file=sys.stderr) sys.exit(1) # --- Read & summarize --------------------------------------------------- try: df = _read_sas_head(local_path, row_count=10) except Exception as exc: print(f"File read error: {exc}", file=sys.stderr) sys.exit(2) _print_summary(df)