"""Standalone utility to download a SAS or delimited text file from S3 and print a column-level summary of the first *N* rows. Supported formats ----------------- * **SAS** – ``.sas7bdat``, ``.xpt``, ``.xport`` (read via *pyreadstat*) * **Text** – ``.csv``, ``.tsv``, ``.txt`` (read via *pandas.read_csv*) Configure the four constants below **or** use the CLI arguments, then run:: python3 file_viewer.py python3 file_viewer.py --local path/to/file.csv python3 file_viewer.py --local path/to/data.tsv --delimiter $'\\t' Python 3.14 compatible. """ from __future__ import annotations import argparse import os import sys import boto3 import pandas as pd import pyreadstat # --------------------------------------------------------------------------- # Supported file extensions # --------------------------------------------------------------------------- SAS_EXTENSIONS: set[str] = {".sas7bdat", ".xpt", ".xport"} """File extensions recognised as SAS data files.""" TEXT_EXTENSIONS: set[str] = {".txt", ".csv", ".tsv"} """File extensions recognised as delimited text files.""" SUPPORTED_EXTENSIONS: set[str] = SAS_EXTENSIONS | TEXT_EXTENSIONS """Union of all supported file extensions.""" # --------------------------------------------------------------------------- # Configuration — edit these before running (or use CLI arguments) # --------------------------------------------------------------------------- S3_BUCKET: str = "my-bucket" """S3 bucket name.""" S3_KEY: str = "path/to/file.sas7bdat" """Object key (path) within the bucket to a supported data file.""" LOCAL_FOLDER: str = "./downloads" """Local directory to download the file into.""" AWS_PROFILE: str = "default" """AWS CLI profile name used for authentication.""" # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _ensure_local_copy(bucket: str, key: str, local_path: str) -> None: """Download *key* from *bucket* to *local_path*, skipping if already present. If *local_path* exists and its size matches the S3 object's size, the download is skipped and a message is printed. Supports any file whose extension is in :data:`SUPPORTED_EXTENSIONS`. """ session = boto3.Session(profile_name=AWS_PROFILE) s3 = session.client("s3") remote_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] if os.path.exists(local_path): local_size = os.path.getsize(local_path) if local_size == remote_size: print( f"Local file {local_path} already matches s3://{bucket}/{key} " f"({local_size} bytes); skipping download." ) return print( f"Local file {local_path} size ({local_size} bytes) differs from " f"S3 ({remote_size} bytes); re-downloading." ) print(f"Downloading s3://{bucket}/{key} -> {local_path}") s3.download_file(bucket, key, local_path) print("Download complete.") # -- SAS readers ------------------------------------------------------------- def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame: """Read the first *row_count* rows of a SAS file (``.sas7bdat``, ``.xpt``, ``.xport``).""" ext = os.path.splitext(path)[1].lower() if ext == ".sas7bdat": df, _ = pyreadstat.read_sas7bdat(path, row_offset=0, row_limit=row_count) elif ext in {".xpt", ".xport"}: df, _ = pyreadstat.read_xport(path, row_offset=0, row_limit=row_count) else: raise ValueError(f"Unsupported SAS extension: {ext}") return df # -- Text readers ------------------------------------------------------------ def _read_text_head( path: str, row_count: int = 10, delimiter: str = ",", encoding: str = "utf-8", quotechar: str = '"', ) -> pd.DataFrame: """Read the first *row_count* rows of a delimited text file. Parameters ---------- path : str Path to the ``.csv``, ``.tsv``, or ``.txt`` file. row_count : int, optional Number of data rows to read (default ``10``). delimiter : str, optional Column delimiter (default ``","``). For ``.tsv`` files the caller should pass ``"\\t"``. encoding : str, optional File encoding (default ``"utf-8"``). quotechar : str, optional Character used to quote fields (default ``'"'``). """ return pd.read_csv( path, sep=delimiter, encoding=encoding, quotechar=quotechar, nrows=row_count, ) # -- Unified reader ---------------------------------------------------------- def _read_head( path: str, row_count: int = 10, delimiter: str | None = None, encoding: str = "utf-8", quotechar: str = '"', ) -> pd.DataFrame: """Read the first *row_count* rows of a supported data file. Auto-detects the file type from its extension and delegates to the appropriate reader. For ``.tsv`` files the delimiter defaults to tab (``"\\t"``); for other text files it defaults to ``","``. Parameters ---------- path : str Path to the data file. row_count : int, optional Number of data rows to read (default ``10``). delimiter : str or None, optional Column delimiter for text files. ``None`` means *auto-detect* (tab for ``.tsv``, comma otherwise). encoding : str, optional Encoding for text files (default ``"utf-8"``). quotechar : str, optional Quote character for text files (default ``'"'``). Returns ------- pandas.DataFrame """ ext = os.path.splitext(path)[1].lower() if ext not in SUPPORTED_EXTENSIONS: raise ValueError( f"Unsupported file extension '{ext}'. " f"Supported extensions: {sorted(SUPPORTED_EXTENSIONS)}" ) if ext in SAS_EXTENSIONS: return _read_sas_head(path, row_count=row_count) # --- Text file path --- if delimiter is None: delimiter = "\t" if ext == ".tsv" else "," return _read_text_head( path, row_count=row_count, delimiter=delimiter, encoding=encoding, quotechar=quotechar, ) # -- Display ----------------------------------------------------------------- def _sample_values(series: pd.Series, n: int = 3) -> str: """Return up to *n* non-null sample values as a comma-separated string.""" non_null = series.dropna() samples = non_null.head(n).tolist() if not samples: return "(all null)" return ", ".join(repr(v) for v in samples) def _print_summary(df: pd.DataFrame) -> None: """Print a nicely formatted summary table to stdout.""" # Pre-compute column data rows = [] for col in df.columns: rows.append((col, str(df[col].dtype), _sample_values(df[col], 3))) # Determine column widths hdr_name = "Column Name" hdr_dtype = "Data Type" hdr_samples = "Sample Values (up to 3)" w_name = max(len(hdr_name), *(len(r[0]) for r in rows)) w_dtype = max(len(hdr_dtype), *(len(r[1]) for r in rows)) w_samples = max(len(hdr_samples), *(len(r[2]) for r in rows)) fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}" sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}" print() print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)") print(sep) print(fmt.format(hdr_name, hdr_dtype, hdr_samples)) print(sep) for name, dtype, samples in rows: print(fmt.format(name, dtype, samples)) print(sep) print() # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def _build_parser() -> argparse.ArgumentParser: """Build the argument parser for the file-viewer CLI.""" parser = argparse.ArgumentParser( description=( "Download a SAS or delimited text file from S3 (or read a local " "file) and print a column-level summary of the first N rows.\n\n" "Supported extensions: " + ", ".join(sorted(SUPPORTED_EXTENSIONS)) ), formatter_class=argparse.RawDescriptionHelpFormatter, ) source = parser.add_mutually_exclusive_group() source.add_argument( "--local", metavar="FILE", default=None, help=( "Path to a local data file to summarise (skips S3 download). " "Supported extensions: " + ", ".join(sorted(SUPPORTED_EXTENSIONS)) ), ) source.add_argument( "--s3-key", metavar="KEY", default=None, help="Override the S3_KEY constant with this object key.", ) parser.add_argument( "--rows", type=int, default=10, metavar="N", help="Number of rows to read (default: 10).", ) # Text-file-specific options text_group = parser.add_argument_group( "text file options", "These options apply only to .csv / .tsv / .txt files.", ) text_group.add_argument( "--delimiter", default=None, help=( 'Column delimiter for text files (default: "," for .csv/.txt, ' '"\\t" for .tsv). Use $\'\\t\' in the shell for a literal tab.' ), ) text_group.add_argument( "--encoding", default="utf-8", help='File encoding for text files (default: "utf-8").', ) text_group.add_argument( "--quotechar", default='"', help='Quote character for text files (default: \'"\').', ) return parser # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- if __name__ == "__main__": parser = _build_parser() args = parser.parse_args() if args.local: # ---- Local file mode ----------------------------------------------- local_path = args.local ext = os.path.splitext(local_path)[1].lower() if ext not in SUPPORTED_EXTENSIONS: parser.error( f"Unsupported file extension '{ext}'. " f"Supported: {sorted(SUPPORTED_EXTENSIONS)}" ) if not os.path.isfile(local_path): print(f"File not found: {local_path}", file=sys.stderr) sys.exit(1) else: # ---- S3 download mode ---------------------------------------------- s3_key = args.s3_key or S3_KEY ext = os.path.splitext(s3_key)[1].lower() if ext not in SUPPORTED_EXTENSIONS: parser.error( f"Unsupported file extension '{ext}' in S3 key. " f"Supported: {sorted(SUPPORTED_EXTENSIONS)}" ) os.makedirs(LOCAL_FOLDER, exist_ok=True) local_filename = os.path.basename(s3_key) local_path = os.path.join(LOCAL_FOLDER, local_filename) try: _ensure_local_copy(S3_BUCKET, s3_key, local_path) except Exception as exc: print(f"S3 download error: {exc}", file=sys.stderr) sys.exit(1) # ---- Read & summarise -------------------------------------------------- try: df = _read_head( local_path, row_count=args.rows, delimiter=args.delimiter, encoding=args.encoding, quotechar=args.quotechar, ) except Exception as exc: print(f"File read error: {exc}", file=sys.stderr) sys.exit(2) _print_summary(df)