Compare commits
5 Commits
1bbe0d4cd6
...
2d95711d9d
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2d95711d9d | ||
|
|
f1e99d887d | ||
|
|
f101eacffd | ||
|
|
edb9146682 | ||
|
|
6b12ab969b |
4
generic_loader/.gitignore
vendored
4
generic_loader/.gitignore
vendored
@ -1,5 +1,5 @@
|
|||||||
/.venv
|
/.venv
|
||||||
/samples
|
/samples
|
||||||
/.env
|
/.env
|
||||||
/__pycache__
|
__pycache__/
|
||||||
/venv
|
venv/
|
||||||
|
|||||||
@ -4,8 +4,8 @@ Library-style functions plus a thin CLI wrapper. Designed so an orchestrator
|
|||||||
can wrap the library for directory/batch mode; orchestration is out of scope
|
can wrap the library for directory/batch mode; orchestration is out of scope
|
||||||
here.
|
here.
|
||||||
|
|
||||||
Python 3.9 compatible (target is an air-gapped host that currently only has
|
Python 3.14 compatible (target is an air-gapped host that currently only has
|
||||||
3.9). ``from __future__ import annotations`` lets us use PEP 585 generics
|
3.14). ``from __future__ import annotations`` lets us use PEP 585 generics
|
||||||
as annotations; runtime-resolved type uses (dataclass defaults, etc.) stick
|
as annotations; runtime-resolved type uses (dataclass defaults, etc.) stick
|
||||||
to ``typing``.
|
to ``typing``.
|
||||||
|
|
||||||
|
|||||||
121
utils/file_viewer.py
Normal file
121
utils/file_viewer.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
"""Standalone utility to download a .sas7bdat file from S3 and print a
|
||||||
|
column-level summary of the first 10 rows.
|
||||||
|
|
||||||
|
Configure the four constants below, then run::
|
||||||
|
|
||||||
|
python3 file_viewer.py
|
||||||
|
|
||||||
|
Python 3.14 compatible.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
import pandas as pd
|
||||||
|
import pyreadstat
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration — edit these before running
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# S3 bucket name.
S3_BUCKET: str = "my-bucket"

# Object key (path) within the bucket to the .sas7bdat file.
S3_KEY: str = "path/to/file.sas7bdat"

# Local directory to download the file into.
LOCAL_FOLDER: str = "./downloads"

# AWS CLI profile name used for authentication.
AWS_PROFILE: str = "default"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _download_from_s3(bucket: str, key: str, local_path: str) -> None:
    """Fetch the object *key* in *bucket* and store it at *local_path*.

    The S3 client is created from a boto3 session bound to the module-level
    ``AWS_PROFILE`` profile. A progress line is printed to stdout before the
    transfer starts and another once it has finished.
    """
    client = boto3.Session(profile_name=AWS_PROFILE).client("s3")
    print(f"Downloading s3://{bucket}/{key} -> {local_path}")
    client.download_file(bucket, key, local_path)
    print("Download complete.")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_sas_head(path: str, row_count: int = 10) -> pd.DataFrame:
    """Load only the leading *row_count* rows of the .sas7bdat file at *path*.

    The reader call yields a two-element result; the first element is
    returned and the second is discarded.
    """
    frame, _unused = pyreadstat.read_sas7bdat(
        path,
        row_limit=row_count,
        row_offset=0,
    )
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_values(series: pd.Series, n: int = 3) -> str:
    """Render the first *n* non-null entries of *series* as ``repr`` text.

    The entries are joined with ``", "``. When the series holds no non-null
    entry at all, the placeholder ``"(all null)"`` is returned instead.
    """
    picked = series.dropna().head(n).tolist()
    return ", ".join(map(repr, picked)) if picked else "(all null)"
|
||||||
|
|
||||||
|
|
||||||
|
def _print_summary(df: pd.DataFrame) -> None:
    """Print a fixed-width, three-column summary of *df* to stdout.

    One line is emitted per DataFrame column, showing the column label, its
    pandas dtype and up to three non-null sample values (formatted by
    ``_sample_values``). The table is framed by dashed separator lines and
    preceded by a one-line caption with the row and column counts.

    A frame without any columns is supported: only the caption, the header
    and the separators are printed. Column labels are passed through
    ``str`` so that non-string labels can be measured and aligned.
    """
    headers = ("Column Name", "Data Type", "Sample Values (up to 3)")

    # ``df.items()`` yields one (label, Series) pair per column, so each
    # entry has a single dtype even if a label occurs more than once.
    rows = [
        (str(label), str(series.dtype), _sample_values(series, 3))
        for label, series in df.items()
    ]

    # Transpose header + data rows into per-column cell groups. Each group
    # always contains its header, so ``max`` never sees an empty sequence
    # (the previous ``max(n, *gen)`` form raised TypeError with zero rows).
    w_name, w_dtype, w_samples = (
        max(len(cell) for cell in column) for column in zip(headers, *rows)
    )

    fmt = f" {{:<{w_name}}} {{:<{w_dtype}}} {{:<{w_samples}}}"
    sep = f" {'-' * w_name} {'-' * w_dtype} {'-' * w_samples}"

    print()
    print(f" Summary of first {len(df)} row(s) ({len(df.columns)} columns)")
    print(sep)
    print(fmt.format(*headers))
    print(sep)
    for row in rows:
        print(fmt.format(*row))
    print(sep)
    print()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Step 1: make sure the target folder exists, then fetch the object into
    # it under the final path component of the S3 key.
    os.makedirs(LOCAL_FOLDER, exist_ok=True)
    local_path = os.path.join(LOCAL_FOLDER, os.path.basename(S3_KEY))

    try:
        _download_from_s3(S3_BUCKET, S3_KEY, local_path)
    except Exception as download_error:
        # Any download failure is reported on stderr and ends with status 1.
        print(f"S3 download error: {download_error}", file=sys.stderr)
        sys.exit(1)

    # Step 2: load the first ten rows and print the per-column summary.
    try:
        df = _read_sas_head(local_path, row_count=10)
    except Exception as read_error:
        # Any read failure is reported on stderr and ends with status 2.
        print(f"File read error: {read_error}", file=sys.stderr)
        sys.exit(2)

    _print_summary(df)
|
||||||
Loading…
Reference in New Issue
Block a user