2026-04-22 01:05:26 +00:00
|
|
|
|
"""Standalone utility to download a SAS or delimited text file from S3 and
|
|
|
|
|
|
print a column-level summary of the first *N* rows.
|
2026-04-18 16:19:38 +00:00
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
|
Supported formats
|
|
|
|
|
|
-----------------
|
|
|
|
|
|
* **SAS** – ``.sas7bdat``, ``.xpt``, ``.xport`` (read via *pyreadstat*)
|
|
|
|
|
|
* **Text** – ``.csv``, ``.tsv``, ``.txt`` (read via *pandas.read_csv*)
|
|
|
|
|
|
|
|
|
|
|
|
Configure the four constants below (``--s3-key`` overrides ``S3_KEY``; ``--local`` reads a local file and skips S3), then run::
|
2026-04-18 16:19:38 +00:00
|
|
|
|
|
|
|
|
|
|
python3 file_viewer.py
|
2026-04-22 01:05:26 +00:00
|
|
|
|
python3 file_viewer.py --local path/to/file.csv
|
|
|
|
|
|
python3 file_viewer.py --local path/to/data.tsv --delimiter $'\\t'
|
2026-04-18 16:19:38 +00:00
|
|
|
|
|
2026-04-18 18:43:29 +00:00
|
|
|
|
Python 3.14 compatible.
|
2026-04-18 16:19:38 +00:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
|
import argparse
|
2026-04-18 16:19:38 +00:00
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
import boto3
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
import pyreadstat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
2026-04-22 01:05:26 +00:00
|
|
|
|
# Supported file extensions
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
#: File extensions recognised as SAS data files.
SAS_EXTENSIONS: set[str] = {".sas7bdat", ".xpt", ".xport"}

#: File extensions recognised as delimited text files.
TEXT_EXTENSIONS: set[str] = {".txt", ".csv", ".tsv"}

#: Union of all supported file extensions.
SUPPORTED_EXTENSIONS: set[str] = SAS_EXTENSIONS.union(TEXT_EXTENSIONS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Configuration — edit these before running (or use CLI arguments)
|
2026-04-18 16:19:38 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
#: S3 bucket name.
S3_BUCKET: str = "my-bucket"

#: Object key (path) within the bucket to a supported data file.
S3_KEY: str = "path/to/file.sas7bdat"

#: Local directory to download the file into.
LOCAL_FOLDER: str = "./downloads"

#: AWS CLI profile name used for authentication.
AWS_PROFILE: str = "default"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Helpers
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-20 13:25:27 +00:00
|
|
|
|
def _ensure_local_copy(bucket: str, key: str, local_path: str) -> None:
    """Make sure *local_path* holds a copy of ``s3://bucket/key``.

    The object's size is fetched with ``head_object`` using a session for
    the :data:`AWS_PROFILE` profile.  When *local_path* already exists and
    has exactly that size, nothing is downloaded and a message is printed.
    Otherwise (file missing, or sizes differ) the object is downloaded,
    overwriting whatever is at *local_path*.

    Only sizes are compared; file contents are not checked.  The function
    does not inspect the file extension.
    """
    client = boto3.Session(profile_name=AWS_PROFILE).client("s3")

    # Ask S3 for the object's size before looking at the local file.
    expected_bytes = client.head_object(Bucket=bucket, Key=key)["ContentLength"]

    if os.path.exists(local_path):
        present_bytes = os.path.getsize(local_path)
        if present_bytes != expected_bytes:
            print(
                f"Local file {local_path} size ({present_bytes} bytes) differs from "
                f"S3 ({expected_bytes} bytes); re-downloading."
            )
        else:
            print(
                f"Local file {local_path} already matches s3://{bucket}/{key} "
                f"({present_bytes} bytes); skipping download."
            )
            return

    print(f"Downloading s3://{bucket}/{key} -> {local_path}")
    client.download_file(bucket, key, local_path)
    print("Download complete.")
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
|
# -- SAS readers -------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-18 16:19:38 +00:00
|
|
|
|
def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
|
2026-04-22 01:05:26 +00:00
|
|
|
|
"""Read the first *row_count* rows of a SAS file (``.sas7bdat``, ``.xpt``, ``.xport``)."""
|
|
|
|
|
|
ext = os.path.splitext(path)[1].lower()
|
|
|
|
|
|
if ext == ".sas7bdat":
|
|
|
|
|
|
df, _ = pyreadstat.read_sas7bdat(path, row_offset=0, row_limit=row_count)
|
|
|
|
|
|
elif ext in {".xpt", ".xport"}:
|
|
|
|
|
|
df, _ = pyreadstat.read_xport(path, row_offset=0, row_limit=row_count)
|
|
|
|
|
|
else:
|
|
|
|
|
|
raise ValueError(f"Unsupported SAS extension: {ext}")
|
2026-04-18 16:19:38 +00:00
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
|
# -- Text readers ------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _read_text_head(
|
|
|
|
|
|
path: str,
|
|
|
|
|
|
row_count: int = 10,
|
|
|
|
|
|
delimiter: str = ",",
|
|
|
|
|
|
encoding: str = "utf-8",
|
|
|
|
|
|
quotechar: str = '"',
|
|
|
|
|
|
) -> pd.DataFrame:
|
|
|
|
|
|
"""Read the first *row_count* rows of a delimited text file.
|
|
|
|
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
|
|
----------
|
|
|
|
|
|
path : str
|
|
|
|
|
|
Path to the ``.csv``, ``.tsv``, or ``.txt`` file.
|
|
|
|
|
|
row_count : int, optional
|
|
|
|
|
|
Number of data rows to read (default ``10``).
|
|
|
|
|
|
delimiter : str, optional
|
|
|
|
|
|
Column delimiter (default ``","``). For ``.tsv`` files the caller
|
|
|
|
|
|
should pass ``"\\t"``.
|
|
|
|
|
|
encoding : str, optional
|
|
|
|
|
|
File encoding (default ``"utf-8"``).
|
|
|
|
|
|
quotechar : str, optional
|
|
|
|
|
|
Character used to quote fields (default ``'"'``).
|
|
|
|
|
|
"""
|
|
|
|
|
|
return pd.read_csv(
|
|
|
|
|
|
path,
|
|
|
|
|
|
sep=delimiter,
|
|
|
|
|
|
encoding=encoding,
|
|
|
|
|
|
quotechar=quotechar,
|
|
|
|
|
|
nrows=row_count,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- Unified reader ----------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _read_head(
    path: str,
    row_count: int = 10,
    delimiter: str | None = None,
    encoding: str = "utf-8",
    quotechar: str = '"',
) -> pd.DataFrame:
    """Load the leading rows of any supported data file.

    The lower-cased file extension selects the reader: SAS extensions go to
    :func:`_read_sas_head`, everything else that is supported goes to
    :func:`_read_text_head`.

    Parameters
    ----------
    path : str
        Location of the data file.
    row_count : int, optional
        How many data rows to load (default ``10``).
    delimiter : str or None, optional
        Field separator for text files.  When ``None`` it is chosen from the
        extension: tab (``"\\t"``) for ``.tsv``, ``","`` for other text
        files.  Not used for SAS files.
    encoding : str, optional
        Text-file encoding (default ``"utf-8"``).  Not used for SAS files.
    quotechar : str, optional
        Text-file quote character (default ``'"'``).  Not used for SAS files.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is not in :data:`SUPPORTED_EXTENSIONS`.
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix not in SUPPORTED_EXTENSIONS:
        known = sorted(SUPPORTED_EXTENSIONS)
        raise ValueError(
            f"Unsupported file extension '{suffix}'. "
            f"Supported extensions: {known}"
        )

    if suffix in SAS_EXTENSIONS:
        return _read_sas_head(path, row_count=row_count)

    # Text file: an explicit delimiter wins, otherwise go by extension.
    if delimiter is not None:
        chosen_delimiter = delimiter
    elif suffix == ".tsv":
        chosen_delimiter = "\t"
    else:
        chosen_delimiter = ","

    return _read_text_head(
        path,
        row_count=row_count,
        delimiter=chosen_delimiter,
        encoding=encoding,
        quotechar=quotechar,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- Display -----------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-18 16:19:38 +00:00
|
|
|
|
def _sample_values(series: pd.Series, n: int = 3) -> str:
|
|
|
|
|
|
"""Return up to *n* non-null sample values as a comma-separated string."""
|
|
|
|
|
|
non_null = series.dropna()
|
|
|
|
|
|
samples = non_null.head(n).tolist()
|
|
|
|
|
|
if not samples:
|
|
|
|
|
|
return "(all null)"
|
|
|
|
|
|
return ", ".join(repr(v) for v in samples)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _print_summary(df: pd.DataFrame) -> None:
|
|
|
|
|
|
"""Print a nicely formatted summary table to stdout."""
|
|
|
|
|
|
# Pre-compute column data
|
|
|
|
|
|
rows = []
|
|
|
|
|
|
for col in df.columns:
|
|
|
|
|
|
rows.append((col, str(df[col].dtype), _sample_values(df[col], 3)))
|
|
|
|
|
|
|
|
|
|
|
|
# Determine column widths
|
|
|
|
|
|
hdr_name = "Column Name"
|
|
|
|
|
|
hdr_dtype = "Data Type"
|
|
|
|
|
|
hdr_samples = "Sample Values (up to 3)"
|
|
|
|
|
|
|
|
|
|
|
|
w_name = max(len(hdr_name), *(len(r[0]) for r in rows))
|
|
|
|
|
|
w_dtype = max(len(hdr_dtype), *(len(r[1]) for r in rows))
|
|
|
|
|
|
w_samples = max(len(hdr_samples), *(len(r[2]) for r in rows))
|
|
|
|
|
|
|
|
|
|
|
|
fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}"
|
|
|
|
|
|
sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}"
|
|
|
|
|
|
|
|
|
|
|
|
print()
|
|
|
|
|
|
print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)")
|
|
|
|
|
|
print(sep)
|
|
|
|
|
|
print(fmt.format(hdr_name, hdr_dtype, hdr_samples))
|
|
|
|
|
|
print(sep)
|
|
|
|
|
|
for name, dtype, samples in rows:
|
|
|
|
|
|
print(fmt.format(name, dtype, samples))
|
|
|
|
|
|
print(sep)
|
|
|
|
|
|
print()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-22 01:05:26 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# CLI
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the file viewer.

    Options defined here:

    * ``--local FILE`` / ``--s3-key KEY`` – mutually exclusive source
      selectors (both default to ``None``).
    * ``--rows N`` – integer row count, default ``10``.
    * ``--delimiter``, ``--encoding``, ``--quotechar`` – grouped under
      "text file options".
    """
    extension_list = ", ".join(sorted(SUPPORTED_EXTENSIONS))

    description = (
        "Download a SAS or delimited text file from S3 (or read a local "
        "file) and print a column-level summary of the first N rows.\n\n"
        "Supported extensions: "
    ) + extension_list
    # The raw formatter keeps the blank line embedded in the description.
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Where the data comes from: a local path or an S3 key, never both.
    source = parser.add_mutually_exclusive_group()
    local_help = (
        "Path to a local data file to summarise (skips S3 download). "
        "Supported extensions: "
    ) + extension_list
    source.add_argument("--local", metavar="FILE", default=None, help=local_help)
    source.add_argument(
        "--s3-key",
        metavar="KEY",
        default=None,
        help="Override the S3_KEY constant with this object key.",
    )

    parser.add_argument(
        "--rows",
        type=int,
        default=10,
        metavar="N",
        help="Number of rows to read (default: 10).",
    )

    # Options that only matter for delimited text input.
    text_group = parser.add_argument_group(
        "text file options",
        "These options apply only to .csv / .tsv / .txt files.",
    )
    delimiter_help = (
        'Column delimiter for text files (default: "," for .csv/.txt, '
        '"\\t" for .tsv). Use $\'\\t\' in the shell for a literal tab.'
    )
    text_group.add_argument("--delimiter", default=None, help=delimiter_help)
    text_group.add_argument(
        "--encoding",
        default="utf-8",
        help='File encoding for text files (default: "utf-8").',
    )
    text_group.add_argument(
        "--quotechar",
        default='"',
        help='Quote character for text files (default: \'"\').',
    )

    return parser
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-18 16:19:38 +00:00
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Main
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Run the file viewer and return the process exit code.

    Parameters
    ----------
    argv : list of str or None, optional
        Command-line arguments without the program name.  ``None`` lets
        ``argparse`` read them from ``sys.argv``.

    Returns
    -------
    int
        ``0`` on success, ``1`` when the local file is missing or the S3
        download fails, ``2`` when the file cannot be read.  An unsupported
        extension is reported through ``parser.error``, which exits the
        process itself.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.local:
        # ---- Local file mode -------------------------------------------
        local_path = args.local
        ext = os.path.splitext(local_path)[1].lower()
        if ext not in SUPPORTED_EXTENSIONS:
            parser.error(
                f"Unsupported file extension '{ext}'. "
                f"Supported: {sorted(SUPPORTED_EXTENSIONS)}"
            )
        if not os.path.isfile(local_path):
            print(f"File not found: {local_path}", file=sys.stderr)
            return 1
    else:
        # ---- S3 download mode ------------------------------------------
        s3_key = args.s3_key or S3_KEY
        ext = os.path.splitext(s3_key)[1].lower()
        if ext not in SUPPORTED_EXTENSIONS:
            parser.error(
                f"Unsupported file extension '{ext}' in S3 key. "
                f"Supported: {sorted(SUPPORTED_EXTENSIONS)}"
            )

        os.makedirs(LOCAL_FOLDER, exist_ok=True)
        # Only the key's final path component is used as the local name.
        local_filename = os.path.basename(s3_key)
        local_path = os.path.join(LOCAL_FOLDER, local_filename)

        try:
            _ensure_local_copy(S3_BUCKET, s3_key, local_path)
        except Exception as exc:  # top-level boundary: report and exit
            print(f"S3 download error: {exc}", file=sys.stderr)
            return 1

    # ---- Read & summarise ----------------------------------------------
    try:
        df = _read_head(
            local_path,
            row_count=args.rows,
            delimiter=args.delimiter,
            encoding=args.encoding,
            quotechar=args.quotechar,
        )
    except Exception as exc:  # top-level boundary: report and exit
        print(f"File read error: {exc}", file=sys.stderr)
        return 2

    _print_summary(df)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|