advanced_analyzer #8
@ -32,6 +32,17 @@ USAGE
|
|||||||
# include: [ID, INTCOL]
|
# include: [ID, INTCOL]
|
||||||
# exclude: [ALLNULL]
|
# exclude: [ALLNULL]
|
||||||
|
|
||||||
|
# Optional folder default for explicit column type overrides. These
|
||||||
|
# win over the cluster-wide auto-union computed during pre-scan; set
|
||||||
|
# them when a column's SAS-level type varies across files (e.g. phone
|
||||||
|
# IDs stored as CHAR in some years and NUM in others) and you want to
|
||||||
|
# pin the Postgres type yourself rather than accept the auto-derived
|
||||||
|
# one. Per-cluster column_types inside each clusters[*] entry are
|
||||||
|
# merged on top of this map.
|
||||||
|
# column_types:
|
||||||
|
# RESP_PH_PREFIX_ID: TEXT
|
||||||
|
# SOME_BIGINT_COL: BIGINT
|
||||||
|
|
||||||
# Optional folder default for LIST partitioning. Omit or set [] for no
|
# Optional folder default for LIST partitioning. Omit or set [] for no
|
||||||
# partitioning. Accepts a single string or a list of column names.
|
# partitioning. Accepts a single string or a list of column names.
|
||||||
# partition_by:
|
# partition_by:
|
||||||
@ -43,14 +54,16 @@ USAGE
|
|||||||
|
|
||||||
# Optional explicit cluster patterns. Each pattern is matched against the
|
# Optional explicit cluster patterns. Each pattern is matched against the
|
||||||
# file *basename*. Matched files are pulled out of the auto-detect pool.
|
# file *basename*. Matched files are pulled out of the auto-detect pool.
|
||||||
# Per-cluster if_exists/include/exclude/partition_by/max_partitions
|
# Per-cluster if_exists/include/exclude/partition_by/max_partitions/
|
||||||
# override the folder-level defaults.
|
# column_types override the folder-level defaults.
|
||||||
clusters:
|
clusters:
|
||||||
- pattern: '^group_a\\d+\\.sas7bdat$'
|
- pattern: '^group_a\\d+\\.sas7bdat$'
|
||||||
tablename: group_a
|
tablename: group_a
|
||||||
- pattern: '^group_b\\d+\\.sas7bdat$'
|
- pattern: '^group_b\\d+\\.sas7bdat$'
|
||||||
tablename: group_b
|
tablename: group_b
|
||||||
if_exists: replace
|
if_exists: replace
|
||||||
|
column_types:
|
||||||
|
PHONE_PREFIX: TEXT
|
||||||
|
|
||||||
2. Command-line interface
|
2. Command-line interface
|
||||||
-------------------------
|
-------------------------
|
||||||
@ -158,6 +171,7 @@ from load_sas import (
|
|||||||
create_indexes,
|
create_indexes,
|
||||||
create_table,
|
create_table,
|
||||||
discover_partition_values_chunked,
|
discover_partition_values_chunked,
|
||||||
|
extract_union_metadata,
|
||||||
infer_schema,
|
infer_schema,
|
||||||
iter_sas_chunks,
|
iter_sas_chunks,
|
||||||
read_sas_metadata,
|
read_sas_metadata,
|
||||||
@ -165,6 +179,7 @@ from load_sas import (
|
|||||||
render_create_indexes,
|
render_create_indexes,
|
||||||
render_create_table,
|
render_create_table,
|
||||||
render_partition_ddl,
|
render_partition_ddl,
|
||||||
|
union_column_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -182,7 +197,11 @@ class ClusterSpec:
|
|||||||
|
|
||||||
``partition_by``, ``max_partitions``, and ``indexes`` are resolved from
|
``partition_by``, ``max_partitions``, and ``indexes`` are resolved from
|
||||||
the folder defaults and any per-cluster overrides during
|
the folder defaults and any per-cluster overrides during
|
||||||
:func:`discover_clusters`.
|
:func:`discover_clusters`. ``column_types`` holds the effective type
|
||||||
|
overrides for this cluster: user-supplied YAML entries merged on top
|
||||||
|
of the auto-union result computed during pre-scan (see :func:`main`).
|
||||||
|
The same dict is threaded through to workers so every file in the
|
||||||
|
cluster infers the same schema.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tablename: str
|
tablename: str
|
||||||
@ -195,6 +214,7 @@ class ClusterSpec:
|
|||||||
partition_by: List[str] = field(default_factory=list)
|
partition_by: List[str] = field(default_factory=list)
|
||||||
max_partitions: int = 10_000
|
max_partitions: int = 10_000
|
||||||
indexes: List[str] = field(default_factory=list)
|
indexes: List[str] = field(default_factory=list)
|
||||||
|
column_types: Dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -205,6 +225,8 @@ class _ExplicitPattern:
|
|||||||
An explicit empty list ``[]`` means "disable partitioning for this cluster".
|
An explicit empty list ``[]`` means "disable partitioning for this cluster".
|
||||||
``max_partitions`` defaults to ``None`` meaning "inherit from folder level".
|
``max_partitions`` defaults to ``None`` meaning "inherit from folder level".
|
||||||
``indexes`` defaults to ``None`` meaning "inherit from folder level".
|
``indexes`` defaults to ``None`` meaning "inherit from folder level".
|
||||||
|
``column_types`` defaults to ``None`` meaning "inherit from folder level";
|
||||||
|
an explicit ``{}`` means "no user overrides for this cluster".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pattern: re.Pattern
|
pattern: re.Pattern
|
||||||
@ -216,6 +238,7 @@ class _ExplicitPattern:
|
|||||||
partition_by: Optional[List[str]] = None
|
partition_by: Optional[List[str]] = None
|
||||||
max_partitions: Optional[int] = None
|
max_partitions: Optional[int] = None
|
||||||
indexes: Optional[List[str]] = None
|
indexes: Optional[List[str]] = None
|
||||||
|
column_types: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -224,6 +247,9 @@ class FolderConfig:
|
|||||||
|
|
||||||
``partition_by``, ``max_partitions``, and ``indexes`` serve as defaults
|
``partition_by``, ``max_partitions``, and ``indexes`` serve as defaults
|
||||||
for every cluster unless overridden at the cluster level.
|
for every cluster unless overridden at the cluster level.
|
||||||
|
``column_types`` is a ``{column_name: postgres_type_str}`` map of
|
||||||
|
user-supplied type overrides that win over the auto-union computed
|
||||||
|
during pre-scan.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
folder: Path
|
folder: Path
|
||||||
@ -236,6 +262,7 @@ class FolderConfig:
|
|||||||
partition_by: List[str] = field(default_factory=list)
|
partition_by: List[str] = field(default_factory=list)
|
||||||
max_partitions: int = 10_000
|
max_partitions: int = 10_000
|
||||||
indexes: List[str] = field(default_factory=list)
|
indexes: List[str] = field(default_factory=list)
|
||||||
|
column_types: Dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -396,6 +423,40 @@ def _validate_indexes_vs_columns(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_column_types(
|
||||||
|
raw_value: Any, where: str, *, allow_none: bool = False
|
||||||
|
) -> Optional[Dict[str, str]]:
|
||||||
|
"""Parse a ``column_types`` mapping from YAML.
|
||||||
|
|
||||||
|
The value must be a mapping ``{column_name: pg_type_str}``. Keys and
|
||||||
|
values are whitespace-stripped strings; empty strings raise. When
|
||||||
|
``allow_none`` is True (used for per-cluster entries), an omitted key
|
||||||
|
returns ``None`` to mean "inherit from folder level"; an explicit
|
||||||
|
empty mapping returns ``{}`` (no overrides for this cluster).
|
||||||
|
"""
|
||||||
|
if raw_value is None:
|
||||||
|
return None if allow_none else {}
|
||||||
|
if not isinstance(raw_value, dict):
|
||||||
|
raise ValueError(
|
||||||
|
f"{where}: 'column_types' must be a mapping of "
|
||||||
|
f"{{column_name: postgres_type}}."
|
||||||
|
)
|
||||||
|
out: Dict[str, str] = {}
|
||||||
|
for k, v in raw_value.items():
|
||||||
|
key = str(k).strip()
|
||||||
|
if not key:
|
||||||
|
raise ValueError(
|
||||||
|
f"{where}: 'column_types' contains an empty column name."
|
||||||
|
)
|
||||||
|
if not isinstance(v, str) or not v.strip():
|
||||||
|
raise ValueError(
|
||||||
|
f"{where}: 'column_types[{key}]' must be a non-empty "
|
||||||
|
f"Postgres type string (got {v!r})."
|
||||||
|
)
|
||||||
|
out[key] = v.strip()
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
def load_folder_config(path: Path) -> FolderConfig:
|
def load_folder_config(path: Path) -> FolderConfig:
|
||||||
"""Parse and validate the folder-level YAML config at ``path``.
|
"""Parse and validate the folder-level YAML config at ``path``.
|
||||||
|
|
||||||
@ -438,6 +499,11 @@ def load_folder_config(path: Path) -> FolderConfig:
|
|||||||
indexes = _parse_indexes(raw.get("indexes"), f"Config {path}")
|
indexes = _parse_indexes(raw.get("indexes"), f"Config {path}")
|
||||||
_validate_indexes_vs_columns(indexes, exclude, f"Config {path}")
|
_validate_indexes_vs_columns(indexes, exclude, f"Config {path}")
|
||||||
|
|
||||||
|
# -- folder-level column_types overrides --------------------------------
|
||||||
|
column_types = _parse_column_types(
|
||||||
|
raw.get("column_types"), f"Config {path}"
|
||||||
|
)
|
||||||
|
|
||||||
explicit: List[_ExplicitPattern] = []
|
explicit: List[_ExplicitPattern] = []
|
||||||
clusters_raw = raw.get("clusters") or []
|
clusters_raw = raw.get("clusters") or []
|
||||||
if not isinstance(clusters_raw, list):
|
if not isinstance(clusters_raw, list):
|
||||||
@ -479,6 +545,11 @@ def load_folder_config(path: Path) -> FolderConfig:
|
|||||||
effective_idx = c_indexes if c_indexes is not None else indexes
|
effective_idx = c_indexes if c_indexes is not None else indexes
|
||||||
_validate_indexes_vs_columns(effective_idx, effective_exclude, where)
|
_validate_indexes_vs_columns(effective_idx, effective_exclude, where)
|
||||||
|
|
||||||
|
# -- per-cluster column_types overrides -----------------------------
|
||||||
|
c_column_types = _parse_column_types(
|
||||||
|
entry.get("column_types"), where, allow_none=True
|
||||||
|
)
|
||||||
|
|
||||||
explicit.append(
|
explicit.append(
|
||||||
_ExplicitPattern(
|
_ExplicitPattern(
|
||||||
pattern=compiled,
|
pattern=compiled,
|
||||||
@ -490,6 +561,7 @@ def load_folder_config(path: Path) -> FolderConfig:
|
|||||||
partition_by=c_partition_by,
|
partition_by=c_partition_by,
|
||||||
max_partitions=c_max_partitions,
|
max_partitions=c_max_partitions,
|
||||||
indexes=c_indexes,
|
indexes=c_indexes,
|
||||||
|
column_types=c_column_types,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -504,6 +576,7 @@ def load_folder_config(path: Path) -> FolderConfig:
|
|||||||
partition_by=partition_by,
|
partition_by=partition_by,
|
||||||
max_partitions=max_partitions,
|
max_partitions=max_partitions,
|
||||||
indexes=indexes,
|
indexes=indexes,
|
||||||
|
column_types=column_types or {},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -601,6 +674,14 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
patt.indexes if patt.indexes is not None
|
patt.indexes if patt.indexes is not None
|
||||||
else cfg.indexes
|
else cfg.indexes
|
||||||
)
|
)
|
||||||
|
# Resolve column_types: user overrides only. The auto-union adds
|
||||||
|
# more entries later (in :func:`main`) after the metadata pre-scan.
|
||||||
|
# None = inherit folder, {} = no cluster-level overrides, dict =
|
||||||
|
# cluster-level overrides that win over folder-level entries.
|
||||||
|
if patt.column_types is None:
|
||||||
|
resolved_ct: Dict[str, str] = dict(cfg.column_types)
|
||||||
|
else:
|
||||||
|
resolved_ct = {**cfg.column_types, **patt.column_types}
|
||||||
|
|
||||||
matched = [f for f in remaining if patt.pattern.search(f.name)]
|
matched = [f for f in remaining if patt.pattern.search(f.name)]
|
||||||
if not matched:
|
if not matched:
|
||||||
@ -618,6 +699,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
partition_by=resolved_pb,
|
partition_by=resolved_pb,
|
||||||
max_partitions=resolved_mp,
|
max_partitions=resolved_mp,
|
||||||
indexes=resolved_idx,
|
indexes=resolved_idx,
|
||||||
|
column_types=dict(resolved_ct),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
@ -634,6 +716,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
partition_by=resolved_pb,
|
partition_by=resolved_pb,
|
||||||
max_partitions=resolved_mp,
|
max_partitions=resolved_mp,
|
||||||
indexes=resolved_idx,
|
indexes=resolved_idx,
|
||||||
|
column_types=dict(resolved_ct),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -654,6 +737,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
partition_by=cfg.partition_by,
|
partition_by=cfg.partition_by,
|
||||||
max_partitions=cfg.max_partitions,
|
max_partitions=cfg.max_partitions,
|
||||||
indexes=cfg.indexes,
|
indexes=cfg.indexes,
|
||||||
|
column_types=dict(cfg.column_types),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -666,19 +750,29 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
|
|
||||||
|
|
||||||
def _infer_cluster_schema(
|
def _infer_cluster_schema(
|
||||||
path: Path, include, exclude
|
path: Path,
|
||||||
|
include,
|
||||||
|
exclude,
|
||||||
|
*,
|
||||||
|
column_types: Optional[Dict[str, str]] = None,
|
||||||
) -> Tuple[Dict, Optional[int]]:
|
) -> Tuple[Dict, Optional[int]]:
|
||||||
"""Infer the Postgres column schema from a SAS file preview.
|
"""Infer the Postgres column schema from a SAS file preview.
|
||||||
|
|
||||||
Returns ``(columns, total_rows)``. ``total_rows`` comes from the
|
Returns ``(columns, total_rows)``. ``total_rows`` comes from the
|
||||||
pyreadstat metadata (the file's declared row count) and is threaded
|
pyreadstat metadata (the file's declared row count) and is threaded
|
||||||
through to :func:`_stream_file` so the tqdm progress bar has a real
|
through to :func:`_stream_file` so the tqdm progress bar has a real
|
||||||
denominator instead of an indeterminate spinner.
|
denominator instead of an indeterminate spinner. ``column_types``
|
||||||
|
lets the caller pin specific columns to a chosen Postgres type
|
||||||
|
(typically the merged auto-union + YAML overrides for the cluster).
|
||||||
"""
|
"""
|
||||||
preview_df, meta = read_sas_preview(path)
|
preview_df, meta = read_sas_preview(path)
|
||||||
preview_df = apply_column_filter(preview_df, include, exclude)
|
preview_df = apply_column_filter(preview_df, include, exclude)
|
||||||
total_rows = getattr(meta, "number_rows", None)
|
total_rows = getattr(meta, "number_rows", None)
|
||||||
columns = infer_schema(preview_df, meta, total_rows=total_rows)
|
columns = infer_schema(
|
||||||
|
preview_df, meta,
|
||||||
|
total_rows=total_rows,
|
||||||
|
column_types=column_types,
|
||||||
|
)
|
||||||
return columns, total_rows
|
return columns, total_rows
|
||||||
|
|
||||||
|
|
||||||
@ -748,7 +842,8 @@ def load_cluster(
|
|||||||
|
|
||||||
first, *rest = cluster.files
|
first, *rest = cluster.files
|
||||||
first_columns, first_total_rows = _infer_cluster_schema(
|
first_columns, first_total_rows = _infer_cluster_schema(
|
||||||
first, cluster.include, cluster.exclude
|
first, cluster.include, cluster.exclude,
|
||||||
|
column_types=cluster.column_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
# -- Validate index columns early ---------------------------------------
|
# -- Validate index columns early ---------------------------------------
|
||||||
@ -827,6 +922,7 @@ def load_cluster(
|
|||||||
workers=workers,
|
workers=workers,
|
||||||
progress_queue=progress_queue,
|
progress_queue=progress_queue,
|
||||||
db_overrides=db_overrides,
|
db_overrides=db_overrides,
|
||||||
|
column_types=cluster.column_types,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Serial path: stream the first file on the main connection, then
|
# Serial path: stream the first file on the main connection, then
|
||||||
@ -842,7 +938,8 @@ def load_cluster(
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
for path in rest:
|
for path in rest:
|
||||||
columns, path_total_rows = _infer_cluster_schema(
|
columns, path_total_rows = _infer_cluster_schema(
|
||||||
path, cluster.include, cluster.exclude
|
path, cluster.include, cluster.exclude,
|
||||||
|
column_types=cluster.column_types,
|
||||||
)
|
)
|
||||||
# Uses the same check that if_exists=append runs. A type
|
# Uses the same check that if_exists=append runs. A type
|
||||||
# mismatch or missing column aborts the cluster; because
|
# mismatch or missing column aborts the cluster; because
|
||||||
@ -926,6 +1023,7 @@ def _worker_load_append_file(
|
|||||||
exclude: Optional[List[str]],
|
exclude: Optional[List[str]],
|
||||||
progress_queue: Any,
|
progress_queue: Any,
|
||||||
db_overrides: Optional[Dict[str, Optional[str]]],
|
db_overrides: Optional[Dict[str, Optional[str]]],
|
||||||
|
column_types: Optional[Dict[str, str]] = None,
|
||||||
) -> Tuple[str, int, Optional[str]]:
|
) -> Tuple[str, int, Optional[str]]:
|
||||||
"""Worker process: load one SAS file in append mode.
|
"""Worker process: load one SAS file in append mode.
|
||||||
|
|
||||||
@ -965,7 +1063,11 @@ def _worker_load_append_file(
|
|||||||
preview_df, meta = _read_sas_preview(path)
|
preview_df, meta = _read_sas_preview(path)
|
||||||
preview_df = _apply_column_filter(preview_df, include, exclude)
|
preview_df = _apply_column_filter(preview_df, include, exclude)
|
||||||
total_rows = getattr(meta, "number_rows", None)
|
total_rows = getattr(meta, "number_rows", None)
|
||||||
columns = _infer_schema(preview_df, meta, total_rows=total_rows)
|
columns = _infer_schema(
|
||||||
|
preview_df, meta,
|
||||||
|
total_rows=total_rows,
|
||||||
|
column_types=column_types,
|
||||||
|
)
|
||||||
# Drop the preview ASAP - on a 2M-row wide file it's hundreds of MB
|
# Drop the preview ASAP - on a 2M-row wide file it's hundreds of MB
|
||||||
# and we never need it again after schema inference.
|
# and we never need it again after schema inference.
|
||||||
del preview_df, meta
|
del preview_df, meta
|
||||||
@ -1031,6 +1133,7 @@ def _load_remaining_files_parallel(
|
|||||||
workers: int,
|
workers: int,
|
||||||
progress_queue: Any,
|
progress_queue: Any,
|
||||||
db_overrides: Optional[Dict[str, Optional[str]]],
|
db_overrides: Optional[Dict[str, Optional[str]]],
|
||||||
|
column_types: Optional[Dict[str, str]] = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Run append-mode loads for ``files`` across a process pool.
|
"""Run append-mode loads for ``files`` across a process pool.
|
||||||
|
|
||||||
@ -1069,6 +1172,7 @@ def _load_remaining_files_parallel(
|
|||||||
exclude,
|
exclude,
|
||||||
progress_queue,
|
progress_queue,
|
||||||
db_overrides,
|
db_overrides,
|
||||||
|
column_types,
|
||||||
)
|
)
|
||||||
for p in files
|
for p in files
|
||||||
]
|
]
|
||||||
@ -1219,7 +1323,14 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
print()
|
print()
|
||||||
for c in loadable:
|
for c in loadable:
|
||||||
print(f"--- DDL for cluster {c.tablename!r} ---")
|
print(f"--- DDL for cluster {c.tablename!r} ---")
|
||||||
columns, _ = _infer_cluster_schema(c.files[0], c.include, c.exclude)
|
# Dry-run skips the pre-scan (so no auto-union) but user-supplied
|
||||||
|
# ``column_types`` from YAML are already baked into ``c.column_types``
|
||||||
|
# by ``discover_clusters`` - honor them here so the previewed DDL
|
||||||
|
# matches what a real load would produce on a single-file cluster.
|
||||||
|
columns, _ = _infer_cluster_schema(
|
||||||
|
c.files[0], c.include, c.exclude,
|
||||||
|
column_types=c.column_types,
|
||||||
|
)
|
||||||
# Print parent CREATE TABLE (with PARTITION BY if applicable).
|
# Print parent CREATE TABLE (with PARTITION BY if applicable).
|
||||||
print(
|
print(
|
||||||
render_create_table(
|
render_create_table(
|
||||||
@ -1332,40 +1443,58 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
|
|
||||||
# -- Metadata pre-scan -----------------------------------------------------
|
# -- Metadata pre-scan -----------------------------------------------------
|
||||||
# Sum ``number_rows`` across every file so the tqdm bar has a real
|
# Sum ``number_rows`` across every file so the tqdm bar has a real
|
||||||
# denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
|
# denominator, AND collect the per-column (readstat_type, sas_format)
|
||||||
# fast path, but on multi-GB sas7bdat files that still reads tens of MB
|
# tuples so we can union schemas across files in a cluster before any
|
||||||
# of scattered subheader pages per file - sequentially that's minutes for
|
# CREATE TABLE runs. ``read_sas_metadata`` uses pyreadstat's
|
||||||
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
|
# ``metadataonly=True`` fast path, but on multi-GB sas7bdat files
|
||||||
# so a ThreadPool gives near-linear scaling until the disk saturates.
|
# that still reads tens of MB of scattered subheader pages per file -
|
||||||
# ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
|
# sequentially that's minutes for a 52-file folder. pyreadstat
|
||||||
# without an ETA - useful when pre-scan itself is expensive (half hour+
|
# releases the GIL during I/O and C decoding, so a ThreadPool gives
|
||||||
# on very large files) or when debugging iteratively.
|
# near-linear scaling until the disk saturates. ``--no-prescan``
|
||||||
|
# bypasses the scan entirely; the progress bar then runs without an
|
||||||
|
# ETA *and* the auto-union is skipped (user overrides from YAML
|
||||||
|
# still apply).
|
||||||
all_files: List[Path] = [p for c in loadable for p in c.files]
|
all_files: List[Path] = [p for c in loadable for p in c.files]
|
||||||
grand_total: Optional[int] = 0
|
grand_total: Optional[int] = 0
|
||||||
|
file_meta_by_path: Dict[str, Dict[str, Tuple[str, Optional[str]]]] = {}
|
||||||
|
|
||||||
if args.no_prescan:
|
if args.no_prescan:
|
||||||
grand_total = None
|
grand_total = None
|
||||||
print(
|
print(
|
||||||
f"[info] --no-prescan set: skipping row-count pre-scan for "
|
f"[info] --no-prescan set: skipping row-count pre-scan for "
|
||||||
f"{len(all_files)} file(s); progress bar will show rate + "
|
f"{len(all_files)} file(s); progress bar will show rate + "
|
||||||
f"elapsed but no ETA.",
|
f"elapsed but no ETA. Cluster-wide schema auto-union is also "
|
||||||
|
f"disabled; only user-specified column_types overrides apply.",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prescan_workers = min(16, max(1, len(all_files)))
|
prescan_workers = min(16, max(1, len(all_files)))
|
||||||
print(
|
print(
|
||||||
f"pre-scanning row counts for {len(all_files)} file(s) "
|
f"pre-scanning row counts + per-column metadata for "
|
||||||
f"across {prescan_workers} thread(s)...",
|
f"{len(all_files)} file(s) across {prescan_workers} thread(s)...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _scan_one(p: Path) -> Tuple[Path, Optional[int], Optional[str]]:
|
def _scan_one(
|
||||||
|
p: Path,
|
||||||
|
) -> Tuple[
|
||||||
|
Path,
|
||||||
|
Optional[int],
|
||||||
|
Optional[Dict[str, Tuple[str, Optional[str]]]],
|
||||||
|
Optional[str],
|
||||||
|
]:
|
||||||
try:
|
try:
|
||||||
meta = read_sas_metadata(p)
|
meta = read_sas_metadata(p)
|
||||||
n = getattr(meta, "number_rows", None)
|
n = getattr(meta, "number_rows", None)
|
||||||
return (p, int(n) if n is not None else None, None)
|
col_meta = extract_union_metadata(meta)
|
||||||
|
return (
|
||||||
|
p,
|
||||||
|
int(n) if n is not None else None,
|
||||||
|
col_meta,
|
||||||
|
None,
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return (p, None, str(e))
|
return (p, None, None, str(e))
|
||||||
|
|
||||||
unknown_total_files: List[str] = []
|
unknown_total_files: List[str] = []
|
||||||
running_total = 0
|
running_total = 0
|
||||||
@ -1378,7 +1507,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
dynamic_ncols=True,
|
dynamic_ncols=True,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
for p, n, err in tpool.map(_scan_one, all_files):
|
for p, n, col_meta, err in tpool.map(_scan_one, all_files):
|
||||||
prescan_bar.update(1)
|
prescan_bar.update(1)
|
||||||
if err is not None:
|
if err is not None:
|
||||||
unknown_total_files.append(f"{p.name} ({err})")
|
unknown_total_files.append(f"{p.name} ({err})")
|
||||||
@ -1386,6 +1515,8 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
unknown_total_files.append(p.name)
|
unknown_total_files.append(p.name)
|
||||||
else:
|
else:
|
||||||
running_total += n
|
running_total += n
|
||||||
|
if col_meta is not None:
|
||||||
|
file_meta_by_path[str(p)] = col_meta
|
||||||
finally:
|
finally:
|
||||||
prescan_bar.close()
|
prescan_bar.close()
|
||||||
|
|
||||||
@ -1402,6 +1533,59 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
)
|
)
|
||||||
grand_total = running_total
|
grand_total = running_total
|
||||||
|
|
||||||
|
# -- Cluster-wide schema auto-union ---------------------------------------
|
||||||
|
# For each cluster, compute ``auto_types`` from the union of every
|
||||||
|
# file's metadata (see :func:`load_sas.union_column_types`). Merge with
|
||||||
|
# any user-supplied YAML overrides (user wins) and attach the result
|
||||||
|
# back onto the cluster so every later read - first-file inference,
|
||||||
|
# worker inference, schema-compat check - sees the same frozen schema.
|
||||||
|
# With ``--no-prescan`` the file_meta_by_path dict is empty and
|
||||||
|
# ``auto_types`` resolves to {}, so only the YAML overrides survive.
|
||||||
|
for c in loadable:
|
||||||
|
per_file = [
|
||||||
|
file_meta_by_path[str(p)]
|
||||||
|
for p in c.files
|
||||||
|
if str(p) in file_meta_by_path
|
||||||
|
]
|
||||||
|
auto_types = union_column_types(per_file) if per_file else {}
|
||||||
|
user_overrides = dict(c.column_types) # already merged folder+cluster
|
||||||
|
# User-supplied overrides win over the auto-union.
|
||||||
|
merged = {**auto_types, **user_overrides}
|
||||||
|
c.column_types = merged
|
||||||
|
|
||||||
|
if auto_types:
|
||||||
|
# Only call out columns where auto-union *changed* something
|
||||||
|
# relative to the default "first file wins" inference. We
|
||||||
|
# don't have the default inference in hand at this point, so
|
||||||
|
# log the full resolved map at a debug-friendly level - it's
|
||||||
|
# bounded by column count and the user asked for visibility
|
||||||
|
# into what got overridden.
|
||||||
|
shown = auto_types
|
||||||
|
if user_overrides:
|
||||||
|
# Distinguish the user-forced entries in the log so it's
|
||||||
|
# obvious which types came from YAML.
|
||||||
|
shown = {
|
||||||
|
col: (
|
||||||
|
f"{user_overrides[col]} (user override)"
|
||||||
|
if col in user_overrides
|
||||||
|
else pg
|
||||||
|
)
|
||||||
|
for col, pg in merged.items()
|
||||||
|
}
|
||||||
|
print(
|
||||||
|
f"[info] cluster {c.tablename!r}: auto-union derived "
|
||||||
|
f"{len(auto_types)} column type(s) across "
|
||||||
|
f"{len(per_file)} file(s): {shown}",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
elif user_overrides and args.no_prescan:
|
||||||
|
print(
|
||||||
|
f"[info] cluster {c.tablename!r}: using {len(user_overrides)} "
|
||||||
|
f"user-supplied column_types override(s); auto-union "
|
||||||
|
f"disabled by --no-prescan.",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
|
||||||
# -- Shared progress plumbing ---------------------------------------------
|
# -- Shared progress plumbing ---------------------------------------------
|
||||||
# The queue crosses process boundaries when workers > 1 (managed proxy)
|
# The queue crosses process boundaries when workers > 1 (managed proxy)
|
||||||
# and is a plain in-process queue otherwise; the put/get contract is
|
# and is a plain in-process queue otherwise; the put/get contract is
|
||||||
|
|||||||
@ -307,6 +307,7 @@ class LoaderConfig:
|
|||||||
partition_by: List[str] = field(default_factory=list)
|
partition_by: List[str] = field(default_factory=list)
|
||||||
max_partitions: int = 10_000
|
max_partitions: int = 10_000
|
||||||
indexes: List[str] = field(default_factory=list)
|
indexes: List[str] = field(default_factory=list)
|
||||||
|
column_types: Dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -517,6 +518,35 @@ def load_config(path: Path) -> LoaderConfig:
|
|||||||
f"{missing_in_include}"
|
f"{missing_in_include}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# -- column_types -------------------------------------------------------
|
||||||
|
# Optional ``{column_name: pg_type}`` escape hatch that bypasses
|
||||||
|
# automatic type inference for specific columns. Useful when
|
||||||
|
# pyreadstat reports a column as NUM but the downstream consumer
|
||||||
|
# expects TEXT (e.g. phone-id columns), or when a column has drifted
|
||||||
|
# between CHAR and NUM across file versions and you want to pin
|
||||||
|
# TEXT up front. See also :func:`infer_schema`.
|
||||||
|
raw_ct = raw.get("column_types")
|
||||||
|
column_types: Dict[str, str] = {}
|
||||||
|
if raw_ct is not None:
|
||||||
|
if not isinstance(raw_ct, dict):
|
||||||
|
raise ValueError(
|
||||||
|
f"Config {path}: 'column_types' must be a mapping of "
|
||||||
|
f"{{column_name: postgres_type}}."
|
||||||
|
)
|
||||||
|
for k, v in raw_ct.items():
|
||||||
|
key = str(k).strip()
|
||||||
|
if not key:
|
||||||
|
raise ValueError(
|
||||||
|
f"Config {path}: 'column_types' contains an empty "
|
||||||
|
f"column name."
|
||||||
|
)
|
||||||
|
if not isinstance(v, str) or not v.strip():
|
||||||
|
raise ValueError(
|
||||||
|
f"Config {path}: 'column_types[{key}]' must be a "
|
||||||
|
f"non-empty Postgres type string (got {v!r})."
|
||||||
|
)
|
||||||
|
column_types[key] = v.strip()
|
||||||
|
|
||||||
return LoaderConfig(
|
return LoaderConfig(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
schemaname=schemaname,
|
schemaname=schemaname,
|
||||||
@ -527,6 +557,7 @@ def load_config(path: Path) -> LoaderConfig:
|
|||||||
partition_by=partition_by,
|
partition_by=partition_by,
|
||||||
max_partitions=max_partitions,
|
max_partitions=max_partitions,
|
||||||
indexes=indexes,
|
indexes=indexes,
|
||||||
|
column_types=column_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -687,6 +718,117 @@ def _format_driven_type(sas_format: Optional[str]) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
_DECIMAL_FORMAT_RE = re.compile(r"\.(\d+)")
|
||||||
|
|
||||||
|
|
||||||
|
def _format_hints_decimal(sas_format: Optional[str]) -> bool:
|
||||||
|
"""True if a numeric SAS format string explicitly carries decimal places.
|
||||||
|
|
||||||
|
SAS numeric formats are ``NAMEw.d``; ``d > 0`` means the variable was
|
||||||
|
intended to render with ``d`` decimal digits (COMMA10.2, F8.3, ...).
|
||||||
|
A bare width like ``BEST12.`` or ``F8.`` has no digits after the dot
|
||||||
|
and is treated as integer-presenting. Used by
|
||||||
|
:func:`union_column_types` to pick BIGINT vs DOUBLE PRECISION when a
|
||||||
|
column is numeric in every file of a cluster.
|
||||||
|
"""
|
||||||
|
if not sas_format:
|
||||||
|
return False
|
||||||
|
m = _DECIMAL_FORMAT_RE.search(sas_format)
|
||||||
|
if not m:
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
return int(m.group(1)) > 0
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def extract_union_metadata(
|
||||||
|
meta: Any,
|
||||||
|
) -> Dict[str, Tuple[str, Optional[str]]]:
|
||||||
|
"""Pull the (readstat_type, sas_format) pair for every column in ``meta``.
|
||||||
|
|
||||||
|
Returns a plain dict that's safe to pass between processes and to
|
||||||
|
:func:`union_column_types`. ``readstat_type`` is the simplified type
|
||||||
|
reported by pyreadstat: ``"string"`` for SAS CHAR, ``"double"`` for
|
||||||
|
SAS NUM. ``sas_format`` comes from ``meta.original_variable_types``
|
||||||
|
and drives date/datetime detection during union.
|
||||||
|
"""
|
||||||
|
var_types = dict(getattr(meta, "variable_types", None) or {})
|
||||||
|
formats = dict(getattr(meta, "original_variable_types", None) or {})
|
||||||
|
names = list(
|
||||||
|
getattr(meta, "column_names", None)
|
||||||
|
or list(var_types.keys())
|
||||||
|
or list(formats.keys())
|
||||||
|
)
|
||||||
|
out: Dict[str, Tuple[str, Optional[str]]] = {}
|
||||||
|
for col in names:
|
||||||
|
rtype = str(var_types.get(col, "")) if var_types else ""
|
||||||
|
fmt = formats.get(col)
|
||||||
|
out[col] = (rtype, fmt if fmt else None)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def union_column_types(
|
||||||
|
per_file_metas: Iterable[Dict[str, Tuple[str, Optional[str]]]],
|
||||||
|
) -> Dict[str, str]:
|
||||||
|
"""Derive one Postgres type per column that's safe across every file.
|
||||||
|
|
||||||
|
``per_file_metas`` is an iterable (one entry per file in a cluster) of
|
||||||
|
``{column_name: (readstat_type, sas_format)}`` dicts as produced by
|
||||||
|
:func:`extract_union_metadata`.
|
||||||
|
|
||||||
|
Rules, evaluated per column:
|
||||||
|
|
||||||
|
* **CHAR/NUM drift wins TEXT.** If any file stores the column as CHAR
|
||||||
|
(``readstat_type != "double"``) the union is ``TEXT``. This covers
|
||||||
|
the phone-id case where some years stored ``RESP_PH_PREFIX_ID`` as
|
||||||
|
CHAR and others as NUM.
|
||||||
|
* **All NUM, format hints DATETIME → TIMESTAMP.** Any file whose
|
||||||
|
format resolves to ``TIMESTAMP`` (via :func:`_format_driven_type`)
|
||||||
|
pins the column to ``TIMESTAMP`` even if other files left the
|
||||||
|
format blank.
|
||||||
|
* **All NUM, format hints DATE → DATE.** Same idea for date-only
|
||||||
|
formats.
|
||||||
|
* **All NUM, any decimal hint → DOUBLE PRECISION.** A ``w.d`` format
|
||||||
|
with ``d > 0`` in any file implies fractional values somewhere.
|
||||||
|
* **All NUM, otherwise → BIGINT.** Default to BIGINT per user
|
||||||
|
preference: integer-presenting NUM columns drift between
|
||||||
|
INTEGER/BIGINT/DOUBLE across files, and the few extra bytes are
|
||||||
|
worth not re-failing every load.
|
||||||
|
|
||||||
|
Columns missing from a given file are simply skipped for that file;
|
||||||
|
the union is computed over whichever files *did* supply the column.
|
||||||
|
Columns that never appear anywhere are omitted from the result.
|
||||||
|
"""
|
||||||
|
per_col: Dict[str, List[Tuple[str, Optional[str]]]] = {}
|
||||||
|
for meta in per_file_metas:
|
||||||
|
for col, pair in meta.items():
|
||||||
|
per_col.setdefault(col, []).append(pair)
|
||||||
|
|
||||||
|
result: Dict[str, str] = {}
|
||||||
|
for col, entries in per_col.items():
|
||||||
|
any_char = any(
|
||||||
|
rtype and rtype.lower() != "double" for rtype, _ in entries
|
||||||
|
)
|
||||||
|
if any_char:
|
||||||
|
result[col] = "TEXT"
|
||||||
|
continue
|
||||||
|
formats = [fmt for _, fmt in entries if fmt]
|
||||||
|
driven = [_format_driven_type(f) for f in formats]
|
||||||
|
if "TIMESTAMP" in driven:
|
||||||
|
result[col] = "TIMESTAMP"
|
||||||
|
elif "DATE" in driven:
|
||||||
|
result[col] = "DATE"
|
||||||
|
elif any(_format_hints_decimal(f) for f in formats):
|
||||||
|
result[col] = "DOUBLE PRECISION"
|
||||||
|
else:
|
||||||
|
# Safe default: BIGINT. The user explicitly accepted wasting a
|
||||||
|
# few bytes here to avoid INTEGER→BIGINT widening failures on
|
||||||
|
# multi-year clusters.
|
||||||
|
result[col] = "BIGINT"
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _all_null(series: pd.Series) -> bool:
|
def _all_null(series: pd.Series) -> bool:
|
||||||
if pd.api.types.is_object_dtype(series):
|
if pd.api.types.is_object_dtype(series):
|
||||||
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
|
return bool(series.map(lambda v: v is None or (isinstance(v, str) and v == "") or (isinstance(v, float) and pd.isna(v))).all())
|
||||||
@ -812,6 +954,7 @@ def infer_schema(
|
|||||||
*,
|
*,
|
||||||
coerce_chars: bool = COERCE_CHAR_COLUMNS,
|
coerce_chars: bool = COERCE_CHAR_COLUMNS,
|
||||||
total_rows: Optional[int] = None,
|
total_rows: Optional[int] = None,
|
||||||
|
column_types: Optional[Dict[str, str]] = None,
|
||||||
) -> Dict[str, ColumnSpec]:
|
) -> Dict[str, ColumnSpec]:
|
||||||
"""Infer a Postgres column spec for each column in ``df``.
|
"""Infer a Postgres column spec for each column in ``df``.
|
||||||
|
|
||||||
@ -827,6 +970,14 @@ def infer_schema(
|
|||||||
``total_rows`` lets callers who already sampled the frame (e.g. via
|
``total_rows`` lets callers who already sampled the frame (e.g. via
|
||||||
:func:`read_sas_preview`) report the real file size in the per-column
|
:func:`read_sas_preview`) report the real file size in the per-column
|
||||||
"inferred from first N of M rows" note. Falls back to ``len(df)``.
|
"inferred from first N of M rows" note. Falls back to ``len(df)``.
|
||||||
|
|
||||||
|
``column_types`` is an optional map ``{column_name: pg_type_str}``
|
||||||
|
whose entries bypass inference entirely - the caller has already
|
||||||
|
decided the type (e.g. via :func:`union_column_types` across a
|
||||||
|
cluster, or a YAML ``column_types`` override). Nullability is still
|
||||||
|
computed from the data. Columns in ``column_types`` that don't exist
|
||||||
|
in ``df`` are ignored so a shared override dict can apply to clusters
|
||||||
|
with different column sets.
|
||||||
"""
|
"""
|
||||||
original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
|
original_formats: Dict[str, str] = dict(getattr(meta, "original_variable_types", {}) or {})
|
||||||
|
|
||||||
@ -846,6 +997,8 @@ def infer_schema(
|
|||||||
sample_size = df_rows
|
sample_size = df_rows
|
||||||
sampled = sample_size < effective_total
|
sampled = sample_size < effective_total
|
||||||
|
|
||||||
|
overrides: Dict[str, str] = dict(column_types or {})
|
||||||
|
|
||||||
# Temporarily flip the module-level flag if the caller asked us to.
|
# Temporarily flip the module-level flag if the caller asked us to.
|
||||||
global COERCE_CHAR_COLUMNS
|
global COERCE_CHAR_COLUMNS
|
||||||
saved = COERCE_CHAR_COLUMNS
|
saved = COERCE_CHAR_COLUMNS
|
||||||
@ -858,6 +1011,23 @@ def infer_schema(
|
|||||||
sas_format = original_formats.get(col)
|
sas_format = original_formats.get(col)
|
||||||
notes: List[str] = []
|
notes: List[str] = []
|
||||||
|
|
||||||
|
if col in overrides:
|
||||||
|
pg_type = overrides[col]
|
||||||
|
notes.append(
|
||||||
|
f"type forced to {pg_type} via column_types override"
|
||||||
|
)
|
||||||
|
nullable = _is_nullable(series)
|
||||||
|
out[col] = ColumnSpec(
|
||||||
|
name=col,
|
||||||
|
postgres_type=pg_type,
|
||||||
|
nullable=nullable,
|
||||||
|
sas_format=sas_format,
|
||||||
|
source_dtype=str(series.dtype),
|
||||||
|
notes=notes,
|
||||||
|
sampled=sampled,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
pg_type = _format_driven_type(sas_format)
|
pg_type = _format_driven_type(sas_format)
|
||||||
|
|
||||||
if pg_type is None:
|
if pg_type is None:
|
||||||
@ -1832,7 +2002,33 @@ def _prepare_for_copy(df: pd.DataFrame, columns: Dict[str, ColumnSpec]) -> pd.Da
|
|||||||
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
# astype(str) stringifies NaN/None to the literal "nan"/"None",
|
||||||
# so we mask those after the fact rather than branching per cell.
|
# so we mask those after the fact rather than branching per cell.
|
||||||
na_mask = series.isna()
|
na_mask = series.isna()
|
||||||
out[name] = series.astype(str).mask(na_mask, "")
|
if pd.api.types.is_numeric_dtype(series):
|
||||||
|
# Hit when a column was auto-unioned to TEXT because at
|
||||||
|
# least one file of the cluster stored it as CHAR but this
|
||||||
|
# particular file stored it as NUM (typical of SAS phone-id
|
||||||
|
# columns). Default float formatting would emit "123.0" -
|
||||||
|
# which doesn't match the plain "123" coming from the CHAR
|
||||||
|
# files. When the whole chunk is integer-valued, round to
|
||||||
|
# int before stringifying; when any fractional value is
|
||||||
|
# present we leave float formatting alone so we don't
|
||||||
|
# silently drop precision.
|
||||||
|
nonnull = series.dropna()
|
||||||
|
int_like = False
|
||||||
|
if not nonnull.empty:
|
||||||
|
try:
|
||||||
|
int_like = bool(((nonnull % 1) == 0).all())
|
||||||
|
except TypeError:
|
||||||
|
int_like = False
|
||||||
|
if int_like:
|
||||||
|
# ``Int64`` preserves NA; ``.astype(str)`` renders NA
|
||||||
|
# as '<NA>', which we then mask out alongside original
|
||||||
|
# NaNs.
|
||||||
|
as_str = series.astype("Int64").astype(str)
|
||||||
|
out[name] = as_str.mask(na_mask, "")
|
||||||
|
else:
|
||||||
|
out[name] = series.astype(str).mask(na_mask, "")
|
||||||
|
else:
|
||||||
|
out[name] = series.astype(str).mask(na_mask, "")
|
||||||
elif pg == "BOOLEAN":
|
elif pg == "BOOLEAN":
|
||||||
out[name] = series.astype("boolean") if series.dtype != object else series
|
out[name] = series.astype("boolean") if series.dtype != object else series
|
||||||
else:
|
else:
|
||||||
@ -2064,7 +2260,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
# on columns whose nulls live past the window.
|
# on columns whose nulls live past the window.
|
||||||
preview_df, meta = read_sas_preview(cfg.filename)
|
preview_df, meta = read_sas_preview(cfg.filename)
|
||||||
preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude)
|
preview_df = apply_column_filter(preview_df, cfg.include, cfg.exclude)
|
||||||
columns = infer_schema(preview_df, meta)
|
columns = infer_schema(preview_df, meta, column_types=cfg.column_types)
|
||||||
|
|
||||||
# Validate partition columns exist in the schema after filtering.
|
# Validate partition columns exist in the schema after filtering.
|
||||||
if cfg.partition_by:
|
if cfg.partition_by:
|
||||||
|
|||||||
@ -38,3 +38,15 @@ if_exists: append
|
|||||||
# indexes:
|
# indexes:
|
||||||
# - state
|
# - state
|
||||||
# - zip
|
# - zip
|
||||||
|
|
||||||
|
# column_types: Explicit {column_name: postgres_type} overrides that
|
||||||
|
# bypass automatic type inference for the listed columns. Useful when
|
||||||
|
# pyreadstat reports a column as NUM but you want it stored as TEXT
|
||||||
|
# (phone/ID columns that are conceptually strings), or when a column's
|
||||||
|
# inferred type is off for any other reason. Columns not listed here
|
||||||
|
# fall through to the normal inference path. Nullability is always
|
||||||
|
# computed from the data.
|
||||||
|
#
|
||||||
|
# column_types:
|
||||||
|
# RESP_PH_PREFIX_ID: TEXT
|
||||||
|
# SOMELONG_ID: BIGINT
|
||||||
|
|||||||
@ -61,15 +61,40 @@ auto_detect: true
|
|||||||
# - state
|
# - state
|
||||||
# - zip
|
# - zip
|
||||||
|
|
||||||
|
# Folder-level column_types: Explicit {column_name: postgres_type} map that
|
||||||
|
# bypasses automatic type inference for the listed columns. Applied to
|
||||||
|
# every cluster unless a cluster supplies its own column_types, which are
|
||||||
|
# merged on top (cluster entries win on conflict).
|
||||||
|
#
|
||||||
|
# During --workers>1 runs the pre-scan derives a cluster-wide "auto-union"
|
||||||
|
# type per column (e.g. any file stores the column as CHAR -> TEXT; all
|
||||||
|
# NUM with any format hinting decimals -> DOUBLE PRECISION; otherwise
|
||||||
|
# BIGINT). Entries in column_types here win over that auto-union - use
|
||||||
|
# them when the auto result is wrong or when --no-prescan disables the
|
||||||
|
# auto-union and you still need to pin a column.
|
||||||
|
#
|
||||||
|
# Valid type strings are anything the CREATE TABLE DDL accepts (TEXT,
|
||||||
|
# INTEGER, BIGINT, DOUBLE PRECISION, DATE, TIMESTAMP, ...). Columns that
|
||||||
|
# don't exist in a given file are simply ignored for that file.
|
||||||
|
#
|
||||||
|
# column_types:
|
||||||
|
# RESP_PH_PREFIX_ID: TEXT
|
||||||
|
# RESP_PH_SUFFIX_ID: TEXT
|
||||||
|
# SOMELONG_ID: BIGINT
|
||||||
|
|
||||||
# Explicit cluster patterns. Each pattern is matched against the file
|
# Explicit cluster patterns. Each pattern is matched against the file
|
||||||
# *basename*. Files matched by a pattern are pulled out of the auto-detect
|
# *basename*. Files matched by a pattern are pulled out of the auto-detect
|
||||||
# pool, so explicit and auto clusters compose cleanly.
|
# pool, so explicit and auto clusters compose cleanly.
|
||||||
#
|
#
|
||||||
# `tablename` is required. `if_exists`, `include`, and `exclude` are
|
# `tablename` is required. `if_exists`, `include`, `exclude`, and
|
||||||
# optional per-cluster overrides of the folder-level defaults above.
|
# `column_types` are optional per-cluster overrides of the folder-level
|
||||||
|
# defaults above. Cluster-level column_types entries win over folder-
|
||||||
|
# level entries for the same column.
|
||||||
clusters:
|
clusters:
|
||||||
- pattern: '^group_a\d+\.xpt$'
|
- pattern: '^group_a\d+\.xpt$'
|
||||||
tablename: group_a
|
tablename: group_a
|
||||||
|
# column_types:
|
||||||
|
# INTCOL: TEXT
|
||||||
|
|
||||||
# Example of an explicit override. Uncomment to force the group_b cluster to
|
# Example of an explicit override. Uncomment to force the group_b cluster to
|
||||||
# append instead of replace even though the folder default is "replace":
|
# append instead of replace even though the folder default is "replace":
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user