381 lines
14 KiB
Python
381 lines
14 KiB
Python
"""Generate a kitchen-sink SAS XPORT file plus an expected-types manifest.

Running this script produces two files under samples/:

- sample_kitchensink.xpt            the SAS XPORT test fixture
- sample_kitchensink.expected.json  ground-truth Postgres types for the loader

Tune behavior via the top-level constants below.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import datetime as dt
|
|
import json
|
|
import string
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pyreadstat
|
|
|
|
# Seed handed to numpy's default_rng so every run draws the same data.
SEED = 42
# Number of rows generated for every column.
N_ROWS = 1000
# Fraction of rows blanked out in each column that carries missing values.
NULL_FRACTION = 0.20

# Output locations, relative to the current working directory.
OUT_DIR = Path("samples")
OUT_PATH = OUT_DIR.joinpath("sample_kitchensink.xpt")
MANIFEST_PATH = OUT_DIR.joinpath("sample_kitchensink.expected.json")

# Columns the round-trip check requires to contain no missing values.
POSITIVE_CONTROLS = {"ID", "INTCOL", "STRCOL", "DATECOL", "CONST"}
# Columns the round-trip check requires to be entirely missing.
ALL_NULL_COLS = {"ALLNULL", "ALLNULLC"}
|
|
|
|
|
|
def _missing_mask(rng: np.random.Generator, n: int, frac: float) -> np.ndarray:
    """Build a length-``n`` boolean mask with exactly ``round(frac * n)`` True entries.

    The True positions are drawn without replacement from ``rng``. Fixing the
    count (instead of flipping an independent coin per row) pins the observed
    missing fraction, so the round-trip check can compare it against the
    target with a small tolerance.
    """
    n_missing = int(round(frac * n))
    flags = np.zeros(n, dtype=bool)
    if n_missing <= 0:
        # Nothing to mark; the generator is deliberately left untouched.
        return flags
    chosen = rng.choice(n, size=n_missing, replace=False)
    flags[chosen] = True
    return flags
|
|
|
|
|
|
def _random_word(rng: np.random.Generator, min_len: int = 3, max_len: int = 10) -> str:
    """Return a random lowercase ASCII word of ``min_len``..``max_len`` letters (inclusive).

    Two draws are taken from ``rng``: first the word length, then the letters.
    """
    n_chars = int(rng.integers(min_len, max_len + 1))
    alphabet = np.array(tuple(string.ascii_lowercase))
    picked = rng.choice(alphabet, size=n_chars)
    return "".join(picked.tolist())
|
|
|
|
|
|
def _random_sentence(rng: np.random.Generator, min_words: int = 8, max_words: int = 20) -> str:
    """Return ``min_words``..``max_words`` (inclusive) random words joined by single spaces.

    The word count is drawn from ``rng`` first; each word then comes from
    ``_random_word`` with its default length bounds.
    """
    word_count = int(rng.integers(min_words, max_words + 1))
    words = [_random_word(rng) for _ in range(word_count)]
    return " ".join(words)
|
|
|
|
|
|
def build_dataframe(rng: np.random.Generator) -> pd.DataFrame:
    """Assemble the kitchen-sink DataFrame that is written out as the XPORT fixture.

    The frame has ``N_ROWS`` rows and sixteen columns. Columns are generated
    strictly in output order because every value is drawn from the shared
    ``rng``; reordering any draw would change the data produced for a given
    seed. Columns with missing values each take their own mask from
    ``_missing_mask`` *before* their per-row values are drawn. Missing entries
    are NaN for numeric columns, ``pd.NaT`` for DTCOL, ``None`` for TIMECOL
    and the empty string for character columns.
    """
    n = N_ROWS
    epoch_date = dt.date(2020, 1, 1)
    epoch_dt = dt.datetime(2020, 1, 1, 0, 0, 0)

    def next_mask() -> np.ndarray:
        # Fresh missing-value mask for the column about to be generated.
        return _missing_mask(rng, n, NULL_FRACTION)

    def random_date() -> dt.date:
        return epoch_date + dt.timedelta(days=int(rng.integers(0, 2000)))

    def random_datetime() -> dt.datetime:
        shift = int(rng.integers(0, 2000 * 24 * 3600))
        return epoch_dt + dt.timedelta(seconds=shift)

    def random_time() -> dt.time:
        second_of_day = int(rng.integers(0, 24 * 3600))
        hours, leftover = divmod(second_of_day, 3600)
        minutes, seconds = divmod(leftover, 60)
        return dt.time(hours, minutes, seconds)

    def random_numeric_text() -> str:
        # Coin flip between an integer-looking and a two-decimal string.
        if rng.random() < 0.5:
            return str(int(rng.integers(-500, 500)))
        return f"{rng.normal(0, 50):.2f}"

    def random_mixed_text() -> str:
        # "text" is listed twice, so half of the non-missing values are words.
        kinds = ("number", "date", "text", "text")
        kind = kinds[int(rng.integers(0, len(kinds)))]
        if kind == "number":
            return str(int(rng.integers(0, 1000)))
        if kind == "date":
            return random_date().isoformat()
        return _random_word(rng, 4, 12)

    columns: dict[str, object] = {}

    columns["ID"] = np.arange(1, n + 1, dtype=np.int64)
    columns["INTCOL"] = rng.integers(0, 1000, size=n).astype(np.float64)

    # Values drawn from [10_000_000_000, 20_000_000_000), stored as float64.
    big = rng.integers(10_000_000_000, 20_000_000_000, size=n).astype(np.float64)
    big[next_mask()] = np.nan
    columns["BIGINT"] = big

    floats = rng.normal(loc=100.0, scale=15.0, size=n)
    floats[next_mask()] = np.nan
    columns["FLOATCOL"] = floats

    flags = rng.integers(0, 2, size=n).astype(np.float64)
    flags[next_mask()] = np.nan
    columns["BOOLCOL"] = flags

    columns["STRCOL"] = [_random_word(rng, 3, 8) for _ in range(n)]
    columns["LONGSTR"] = [
        "" if hidden else _random_sentence(rng) for hidden in next_mask()
    ]
    columns["DATECOL"] = [random_date() for _ in range(n)]
    columns["DTCOL"] = pd.to_datetime(
        [pd.NaT if hidden else random_datetime() for hidden in next_mask()]
    )
    columns["TIMECOL"] = [None if hidden else random_time() for hidden in next_mask()]
    columns["NUMASSTR"] = [
        "" if hidden else random_numeric_text() for hidden in next_mask()
    ]
    columns["DATEASTR"] = [
        "" if hidden else random_date().isoformat() for hidden in next_mask()
    ]
    columns["MIXED"] = ["" if hidden else random_mixed_text() for hidden in next_mask()]

    columns["CONST"] = ["CONSTANT"] * n
    columns["ALLNULL"] = np.full(n, np.nan, dtype=np.float64)
    columns["ALLNULLC"] = [""] * n

    return pd.DataFrame(columns)
|
|
|
|
|
|
# Human-readable label for each generated column, keyed by column name.
# Passed to pyreadstat.write_xport as ``column_labels``.
COLUMN_LABELS: dict[str, str] = {
    "ID": "Row identifier",
    "INTCOL": "Integer positive control",
    "BIGINT": "Big integer beyond int32 range",
    "FLOATCOL": "Floating point with decimals",
    "BOOLCOL": "Nullable boolean 0/1/NaN",
    "STRCOL": "Short string positive control",
    "LONGSTR": "Longer free-text string",
    "DATECOL": "Date positive control",
    "DTCOL": "Datetime with missing values",
    "TIMECOL": "Time of day with missing values",
    "NUMASSTR": "Numeric-looking strings in a char column",
    "DATEASTR": "Date-looking strings in a char column",
    "MIXED": "Heterogeneous strings: fallback to text",
    "CONST": "Constant repeated value",
    "ALLNULL": "Entirely missing numeric column",
    "ALLNULLC": "Entirely missing character column",
}
|
|
|
|
|
|
# SAS display formats for the temporal columns, keyed by column name.
# Passed to pyreadstat.write_xport as ``variable_format``.
VARIABLE_FORMATS: dict[str, str] = {
    "DATECOL": "DATE9.",
    "DTCOL": "DATETIME20.",
    "TIMECOL": "TIME8.",
}
|
|
|
|
|
|
# Ground-truth expectations per column, serialized verbatim to MANIFEST_PATH.
# Each entry carries either a single "postgres_type" or a list of
# "acceptable_types", a "nullable" flag, and optionally a free-text "note".
EXPECTED_MANIFEST: dict[str, dict] = {
    "ID": {
        "postgres_type": "INTEGER",
        "nullable": False,
    },
    "INTCOL": {
        "postgres_type": "INTEGER",
        "nullable": False,
        "note": "positive control",
    },
    "BIGINT": {
        "postgres_type": "BIGINT",
        "nullable": True,
        "note": "values beyond int32 range",
    },
    "FLOATCOL": {
        "acceptable_types": ["DOUBLE PRECISION", "NUMERIC"],
        "nullable": True,
    },
    "BOOLCOL": {
        "acceptable_types": ["BOOLEAN", "SMALLINT", "INTEGER"],
        "nullable": True,
        "note": "{0,1,NaN} is genuinely ambiguous; loader's choice is a design decision",
    },
    "STRCOL": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": False,
        "note": "positive control",
    },
    "LONGSTR": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": True,
    },
    "DATECOL": {
        "postgres_type": "DATE",
        "nullable": False,
        "note": "positive control",
    },
    "DTCOL": {
        "acceptable_types": ["TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE"],
        "nullable": True,
    },
    "TIMECOL": {
        "postgres_type": "TIME",
        "nullable": True,
    },
    "NUMASSTR": {
        "acceptable_types": ["NUMERIC", "DOUBLE PRECISION"],
        "nullable": True,
        "note": "stored as char in SAS; loader should coerce numeric-looking strings",
    },
    "DATEASTR": {
        "postgres_type": "DATE",
        "nullable": True,
        "note": "stored as char in SAS; loader should coerce ISO-date strings",
    },
    "MIXED": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": True,
        "note": "heterogeneous content; loader should fall back to text",
    },
    "CONST": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": False,
    },
    "ALLNULL": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": True,
        "note": "entirely null numeric; loader must pick a default type, typically TEXT",
    },
    "ALLNULLC": {
        "acceptable_types": ["TEXT", "VARCHAR"],
        "nullable": True,
        "note": "entirely null character",
    },
}
|
|
|
|
|
|
def write_manifest(df: pd.DataFrame) -> None:
    """Write ``EXPECTED_MANIFEST`` to ``MANIFEST_PATH`` as indented, key-sorted JSON.

    Before writing, the manifest's keys are compared with ``df``'s column
    names so the two cannot drift apart unnoticed.

    Raises:
        AssertionError: if a DataFrame column has no manifest entry, or the
            manifest describes a column the DataFrame does not have.
    """
    expected_names = set(EXPECTED_MANIFEST)
    frame_names = set(df.columns)
    missing = frame_names - expected_names
    extra = expected_names - frame_names
    if missing or extra:
        raise AssertionError(
            f"Manifest/DataFrame column mismatch. Missing from manifest: {missing}. "
            f"Extra in manifest: {extra}."
        )

    # Trailing newline keeps the file POSIX-friendly and diff-clean.
    payload = json.dumps(EXPECTED_MANIFEST, indent=2, sort_keys=True) + "\n"
    MANIFEST_PATH.write_text(payload, encoding="utf-8")
|
|
|
|
|
|
def _char_missing_fraction(series: pd.Series) -> float:
    """Return the share of entries that are null or the empty string.

    Nulls are first replaced by ``""`` and every value is stringified, so a
    character column counts both representations of "missing" alike.
    """
    as_text = series.fillna("").astype(str)
    is_blank = as_text.eq("")
    return float(is_blank.mean())
|
|
|
|
|
|
def _numeric_missing_fraction(series: pd.Series) -> float:
    """Return the share of entries in ``series`` that pandas reports as null."""
    null_flags = series.isna()
    return float(null_flags.mean())
|
|
|
|
|
|
def _require(condition: object, message: str) -> None:
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    A bare ``assert`` statement is removed when Python runs with ``-O``, which
    would turn the whole round-trip verification into a no-op. Raising
    explicitly keeps the checks active in every mode.
    """
    if not condition:
        raise AssertionError(message)


def _missing_fraction(series: pd.Series) -> float:
    """Return the missing fraction of ``series`` using the dtype-appropriate rule.

    Numeric and datetime64 columns count nulls; everything else is treated as
    a character column, where null and the empty string both count as missing.
    """
    if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_datetime64_any_dtype(series):
        return _numeric_missing_fraction(series)
    return _char_missing_fraction(series)


def _all_instances(series: pd.Series, types: tuple) -> bool:
    """Return True when every non-null value of ``series`` is an instance of ``types``."""
    return bool(series.dropna().map(lambda v: isinstance(v, types)).all())


def verify_roundtrip(source_df: pd.DataFrame) -> pd.DataFrame:
    """Read ``OUT_PATH`` back and check it against ``source_df`` and the manifest.

    Checks, in order: column count, column names and row count match the
    source; DATECOL and DTCOL come back datetime-like; TIMECOL comes back as a
    datetime, numeric or time-object column; positive-control columns have no
    missing values; all-null columns are entirely missing; every other column's
    missing fraction is within tolerance of ``NULL_FRACTION``; the manifest
    file exists and lists exactly the columns that were read back.

    Args:
        source_df: The DataFrame that was written to ``OUT_PATH``.

    Returns:
        The DataFrame read back from ``OUT_PATH``.

    Raises:
        AssertionError: if any of the checks above fails.
    """
    # Use pyreadstat (the writer) to verify the writer's output. pyreadstat preserves
    # SAS format metadata on readback, so we can confirm the date/datetime/time
    # variable_format mappings actually took effect.
    readback, _meta = pyreadstat.read_xport(str(OUT_PATH))

    source_cols = set(source_df.columns)
    readback_cols = set(readback.columns)

    _require(
        len(readback.columns) == len(source_df.columns),
        f"Column count mismatch: wrote {len(source_df.columns)}, read back {len(readback.columns)}",
    )
    _require(
        readback_cols == source_cols,
        f"Column name mismatch. Only in source: {source_cols - readback_cols}. "
        f"Only in readback: {readback_cols - source_cols}.",
    )
    _require(
        len(readback) == len(source_df),
        f"Row count mismatch: wrote {len(source_df)}, read back {len(readback)}",
    )

    # Date and datetime columns: either a datetime64 dtype or an object column
    # whose non-null values are all date-like objects.
    date_like = (dt.date, dt.datetime, pd.Timestamp)
    for col in ("DATECOL", "DTCOL"):
        dtype = readback[col].dtype
        is_datetime = pd.api.types.is_datetime64_any_dtype(dtype)
        is_object_of_dates = pd.api.types.is_object_dtype(dtype) and _all_instances(
            readback[col], date_like
        )
        _require(
            is_datetime or is_object_of_dates,
            f"{col} came back as {dtype}; expected datetime-like. "
            "variable_format mapping may not have taken effect.",
        )

    # The time column is accepted in several representations: datetime64,
    # numeric, or an object column of time/datetime/number values.
    time_like = (dt.time, dt.datetime, pd.Timestamp, int, float)
    time_dtype = readback["TIMECOL"].dtype
    time_ok = (
        pd.api.types.is_datetime64_any_dtype(time_dtype)
        or pd.api.types.is_numeric_dtype(time_dtype)
        or (
            pd.api.types.is_object_dtype(time_dtype)
            and _all_instances(readback["TIMECOL"], time_like)
        )
    )
    _require(time_ok, f"TIMECOL came back as {time_dtype}; expected datetime/numeric/time-object")

    tol = 0.10
    for col in source_df.columns:
        observed = _missing_fraction(readback[col])
        if col in POSITIVE_CONTROLS:
            _require(
                observed == 0.0,
                f"Positive control {col!r} has {observed:.2%} missing; expected 0%.",
            )
        elif col in ALL_NULL_COLS:
            _require(
                observed == 1.0,
                f"All-null column {col!r} has {observed:.2%} missing; expected 100%.",
            )
        else:
            _require(
                abs(observed - NULL_FRACTION) <= tol,
                f"Column {col!r}: observed missing fraction {observed:.2%} not within "
                f"±{tol:.0%} of NULL_FRACTION={NULL_FRACTION:.2%}.",
            )

    _require(MANIFEST_PATH.exists(), f"Manifest file {MANIFEST_PATH} missing.")
    with MANIFEST_PATH.open("r", encoding="utf-8") as f:
        manifest = json.load(f)
    manifest_cols = set(manifest.keys())
    _require(
        manifest_cols == readback_cols,
        "Manifest/readback column set mismatch. "
        f"Only in manifest: {manifest_cols - readback_cols}. "
        f"Only in readback: {readback_cols - manifest_cols}.",
    )

    return readback
|
|
|
|
|
|
def main() -> None:
    """Generate the fixture, write the manifest, verify the round trip and report.

    Steps: create ``OUT_DIR`` if needed, build the DataFrame from a generator
    seeded with ``SEED``, write it to ``OUT_PATH`` as a version-5 XPORT file,
    write the manifest, read the file back for verification, then print a
    summary with the read-back dtypes and first rows.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    frame = build_dataframe(np.random.default_rng(SEED))

    xport_options = {
        "file_format_version": 5,
        "table_name": "SAMPLE",
        "file_label": "Kitchen sink sample for loader testing",
        "column_labels": COLUMN_LABELS,
        "variable_format": VARIABLE_FORMATS,
    }
    pyreadstat.write_xport(frame, str(OUT_PATH), **xport_options)

    # The manifest must exist before verification, which checks for it.
    write_manifest(frame)
    roundtripped = verify_roundtrip(frame)

    report_lines = [
        f"Wrote {OUT_PATH} ({N_ROWS} rows x {len(frame.columns)} cols)",
        f"Wrote {MANIFEST_PATH}",
        "",
        "Readback via pyreadstat.read_xport (same reader the loader will use):",
        roundtripped.dtypes.to_string(),
        "",
        "Readback head:",
        roundtripped.head().to_string(),
    ]
    print("\n".join(report_lines))
|
|
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|