"""Standalone utility to download a .sas7bdat file from S3 and print a
column-level summary of the first 10 rows.

Configure the four constants below, then run::

    python3 file_viewer.py

Exit status: 0 on success, 1 if the S3 download fails, 2 if the downloaded
file cannot be read.

Python 3.9 compatible.
"""

from __future__ import annotations

import os
import sys

import pandas as pd

# NOTE: this module arrived in a patch that also appends "__pycache__/" and
# "venv/" to generic_loader/.gitignore and "boto3>=1.28,<2.0" to
# generic_loader/requirements.txt.


# ---------------------------------------------------------------------------
# Configuration — edit these before running
# ---------------------------------------------------------------------------

S3_BUCKET: str = "my-bucket"
"""S3 bucket name."""

S3_KEY: str = "path/to/file.sas7bdat"
"""Object key (path) within the bucket to the .sas7bdat file."""

LOCAL_FOLDER: str = "./downloads"
"""Local directory to download the file into."""

AWS_PROFILE: str = "default"
"""AWS CLI profile name used for authentication."""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _download_from_s3(
    bucket: str,
    key: str,
    local_path: str,
    profile: str | None = None,
) -> None:
    """Download *key* from *bucket* to *local_path* using a named session.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key within *bucket*.
        local_path: Destination file path on the local machine.
        profile: AWS profile name for the session. ``None`` (the default)
            falls back to the module-level ``AWS_PROFILE`` constant.

    Raises:
        Exception: Whatever boto3 raises for a missing profile, bad
            credentials or a failed transfer; ``ImportError`` if boto3 is
            not installed.
    """
    # Imported lazily: only this step needs the AWS SDK, and a missing
    # package is then reported by the caller's error handling instead of
    # aborting the whole script at import time.
    import boto3

    profile_name = AWS_PROFILE if profile is None else profile
    session = boto3.Session(profile_name=profile_name)
    s3 = session.client("s3")
    print(f"Downloading s3://{bucket}/{key} -> {local_path}")
    s3.download_file(bucket, key, local_path)
    print("Download complete.")


def _read_sas_head(
    path: str,
    row_count: int = 10,
    encoding: str = "latin-1",
) -> pd.DataFrame:
    """Read the first *row_count* rows of a .sas7bdat file.

    Args:
        path: Path to the local .sas7bdat file.
        row_count: Maximum number of rows to read. pyreadstat treats ``0``
            as "no limit", so ``0`` reads the whole file.
        encoding: Text encoding handed to pyreadstat for decoding strings.

    Returns:
        A DataFrame holding at most *row_count* rows.
    """
    # Imported lazily for the same reason as boto3 in _download_from_s3.
    import pyreadstat

    # pyreadstat's keyword for capping rows is ``row_limit``; passing
    # ``row_count=`` is rejected as an unexpected keyword argument.
    df, _meta = pyreadstat.read_sas7bdat(
        path, encoding=encoding, row_limit=row_count
    )
    return df


def _sample_values(series: pd.Series, n: int = 3) -> str:
    """Return up to *n* non-null sample values as a comma-separated string.

    Values are rendered with ``repr`` so strings stay visibly quoted.
    Returns the literal ``"(all null)"`` when the series has no non-null
    values (including when it is empty).
    """
    non_null = series.dropna()
    samples = non_null.head(n).tolist()
    if not samples:
        return "(all null)"
    return ", ".join(repr(v) for v in samples)


def _print_summary(df: pd.DataFrame) -> None:
    """Print a formatted per-column summary table of *df* to stdout.

    Each row of the table shows the column label, its dtype and up to three
    non-null sample values. A frame without columns prints just the header.
    """
    # Columns are addressed by position so duplicate labels still yield a
    # Series, and labels are stringified because they need not be ``str``.
    rows = [
        (
            str(label),
            str(df.iloc[:, pos].dtype),
            _sample_values(df.iloc[:, pos], 3),
        )
        for pos, label in enumerate(df.columns)
    ]

    hdr_name = "Column Name"
    hdr_dtype = "Data Type"
    hdr_samples = "Sample Values (up to 3)"

    # Build a list for max(): with zero columns the star-unpacked form
    # degenerates to max(<int>) and raises TypeError.
    w_name = max([len(hdr_name)] + [len(r[0]) for r in rows])
    w_dtype = max([len(hdr_dtype)] + [len(r[1]) for r in rows])
    w_samples = max([len(hdr_samples)] + [len(r[2]) for r in rows])

    fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}"
    sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}"

    print()
    print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)")
    print(sep)
    print(fmt.format(hdr_name, hdr_dtype, hdr_samples))
    print(sep)
    for name, dtype, samples in rows:
        print(fmt.format(name, dtype, samples))
    print(sep)
    print()


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main() -> int:
    """Download the configured object, summarize it, return an exit status.

    Returns:
        0 on success, 1 if the download step fails, 2 if the file cannot be
        read.
    """
    # --- Download -----------------------------------------------------------
    os.makedirs(LOCAL_FOLDER, exist_ok=True)
    local_filename = os.path.basename(S3_KEY)
    if not local_filename:
        # A key ending in "/" has no file name; joining it would make the
        # download target the folder itself.
        print(
            f"S3 download error: S3_KEY {S3_KEY!r} does not name a file",
            file=sys.stderr,
        )
        return 1
    local_path = os.path.join(LOCAL_FOLDER, local_filename)

    try:
        _download_from_s3(S3_BUCKET, S3_KEY, local_path)
    except Exception as exc:  # top-level boundary: report and exit
        print(f"S3 download error: {exc}", file=sys.stderr)
        return 1

    # --- Read & summarize ---------------------------------------------------
    try:
        df = _read_sas_head(local_path, row_count=10)
    except Exception as exc:  # top-level boundary: report and exit
        print(f"File read error: {exc}", file=sys.stderr)
        return 2

    _print_summary(df)
    return 0


if __name__ == "__main__":
    sys.exit(main())