advanced_analyzer #8

Merged
dp merged 23 commits from advanced_analyzer into main 2026-04-21 22:32:18 +00:00
Showing only changes of commit 5449a25b44 - Show all commits

View File

@ -76,21 +76,14 @@ INDEX_UNIQUENESS_PCT: float = 95.0
"""Columns whose distinct/non-null ratio meets or exceeds this threshold are """Columns whose distinct/non-null ratio meets or exceeds this threshold are
flagged as index candidates.""" flagged as index candidates."""
PARTITION_MIN_DISTINCT: int = 2
"""A partition candidate must have at least this many distinct values."""
PARTITION_MAX_DISTINCT: int = 500
"""A partition candidate must have at most this many distinct values. Kept
deliberately tighter than the loader's max_partitions default (10,000) so
the default suggestions stay conservative."""
PARTITION_MIN_FILL_PCT: float = 95.0 PARTITION_MIN_FILL_PCT: float = 95.0
"""Partition candidates must be non-null in at least this fraction of rows.""" """Name-matched partition candidates must be non-null in at least this
fraction of rows."""
PRE_SHARDED_MAX_DISTINCT: int = 3 PRE_SHARDED_MAX_DISTINCT: int = 3
"""A name-matched column with <= this many distinct values is treated as """A name-matched column with <= this many distinct values is treated as
"the file is probably pre-sharded on this column" rather than being pre-sharded ("this file is one slice; sibling files have the other values")
silently dumped into the drop list.""" rather than as a ready-to-partition observed column."""
DISTINCT_CAP: int = 10_000 DISTINCT_CAP: int = 10_000
"""Max size of the per-column distinct-value set. Exceeding this marks the """Max size of the per-column distinct-value set. Exceeding this marks the
@ -108,10 +101,11 @@ PARTITION_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
re.compile(r"^state$", re.IGNORECASE), re.compile(r"^state$", re.IGNORECASE),
re.compile(r"^state_?code$", re.IGNORECASE), re.compile(r"^state_?code$", re.IGNORECASE),
) )
"""Column names that are "probably partition columns" regardless of how """Only columns whose name matches one of these patterns are ever considered
many distinct values happen to be present in this one file. Kept tiny on partition candidates. This deliberately ignores generic low-cardinality
purpose - add more patterns here later if you want to recognise signals (status flags, boolean columns, etc.) because in practice the only
region/year/etc.""" useful partition key in this codebase is STATE. Add more patterns here if
that ever stops being true."""
INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = ( INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
@ -123,12 +117,6 @@ INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
"""Name-bonus patterns for index-candidate ranking.""" """Name-bonus patterns for index-candidate ranking."""
_PARTITION_FRIENDLY_TYPES: frozenset = frozenset(
{"TEXT", "VARCHAR", "CHARACTER VARYING", "CHAR", "CHARACTER",
"INTEGER", "BIGINT", "SMALLINT", "BOOLEAN", "DATE"}
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Per-column streaming aggregator # Per-column streaming aggregator
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -447,8 +435,6 @@ def classify(
*, *,
high_null_pct: float, high_null_pct: float,
index_uniqueness_pct: float, index_uniqueness_pct: float,
partition_min_distinct: int,
partition_max_distinct: int,
partition_min_fill_pct: float, partition_min_fill_pct: float,
pre_sharded_max_distinct: int, pre_sharded_max_distinct: int,
) -> Tuple[ ) -> Tuple[
@ -457,60 +443,69 @@ def classify(
List[_IndexCandidate], List[_IndexCandidate],
List[_TypeWarning], List[_TypeWarning],
]: ]:
"""Turn per-column stats + the loader's schema into four ranked lists.""" """Turn per-column stats + the loader's schema into four ranked lists.
Partition candidates are restricted to columns whose name matches
:data:`PARTITION_NAME_PATTERNS` - in practice STATE / STATE_CODE. A
generic "low-cardinality = partition candidate" heuristic produces too
much noise for this codebase, so we only surface columns we're confident
about by name.
"""
drops: List[_DropCandidate] = [] drops: List[_DropCandidate] = []
partitions: List[_PartitionCandidate] = [] partitions: List[_PartitionCandidate] = []
indexes: List[_IndexCandidate] = [] indexes: List[_IndexCandidate] = []
warnings: List[_TypeWarning] = [] warnings: List[_TypeWarning] = []
# Names we've already routed into the partition lane - exclude them from # -- Partition candidates (name-matched only) --------------------------
# the drop / index lanes downstream.
claimed_by_partition: set = set()
# -- First pass: partition-name-matched columns ------------------------
# Run this before the drop check so pre-sharded STATE columns don't get # Run this before the drop check so pre-sharded STATE columns don't get
# silently dropped. # silently dropped for being "constant".
claimed_by_partition: set = set()
for name, cs in stats.items(): for name, cs in stats.items():
spec = columns.get(name)
if not _matches_any(PARTITION_NAME_PATTERNS, name): if not _matches_any(PARTITION_NAME_PATTERNS, name):
continue continue
if cs.n_total == 0: if cs.n_total == 0 or cs.n_non_null == 0:
continue
if cs.fill_pct < partition_min_fill_pct:
continue continue
looks_pre_sharded = ( is_pre_sharded = (
cs.n_non_null > 0 not cs.distinct_overflow
and not cs.distinct_overflow
and cs.distinct_count <= pre_sharded_max_distinct and cs.distinct_count <= pre_sharded_max_distinct
and cs.fill_pct >= partition_min_fill_pct
) )
if looks_pre_sharded: kind = "pre_sharded" if is_pre_sharded else "observed"
observed = ", ".join(_format_value(v) for v, _ in cs.top_values(pre_sharded_max_distinct)) observed = _format_top_values(cs.top_values(pre_sharded_max_distinct))
note_parts = [
f"pre-sharded: this file only contains {cs.distinct_count} distinct " if is_pre_sharded:
f"value(s) ({observed})", note = (
"keep the column and set partition_by at the load_folder level so " f"pre-sharded: this file only contains {cs.distinct_count} "
"sibling files merge into separate partitions of one table", f"distinct value(s) ({observed}); keep the column and set "
] "partition_by at the load_folder level so sibling files merge "
"into separate partitions of one table"
)
else:
note = (
f"observed {cs.distinct_display} distinct value(s) across "
f"{cs.fill_pct:.1f}% of rows; LIST partitioning will create "
"one child table per distinct value"
)
partitions.append( partitions.append(
_PartitionCandidate( _PartitionCandidate(
name=name, name=name,
kind="pre_sharded", kind=kind,
distinct_count=cs.distinct_count, distinct_count=cs.distinct_count,
fill_pct=cs.fill_pct, fill_pct=cs.fill_pct,
top_values=_format_top_values(cs.top_values()), top_values=_format_top_values(cs.top_values()),
observed_values_in_file=observed, observed_values_in_file=observed,
note="; ".join(note_parts), note=note,
# Pre-sharded STATE always wins the ranking. # Pre-sharded beats observed as the snippet's top pick.
score=1_000_000.0, score=(1_000_000.0 if is_pre_sharded else 500_000.0) + cs.fill_pct,
) )
) )
claimed_by_partition.add(name) claimed_by_partition.add(name)
continue
# Name-matched but not pre-sharded: fall through into the regular partitions.sort(key=lambda p: p.score, reverse=True)
# partition candidate pass below, which will score it up due to the
# name match.
# -- Drop candidates --------------------------------------------------- # -- Drop candidates ---------------------------------------------------
for name, cs in stats.items(): for name, cs in stats.items():
@ -541,62 +536,6 @@ def classify(
drops.append(_DropCandidate(name=name, reason=reason)) drops.append(_DropCandidate(name=name, reason=reason))
dropped_names = {d.name for d in drops} dropped_names = {d.name for d in drops}
# -- Partition candidates (observed) ----------------------------------
for name, cs in stats.items():
if name in claimed_by_partition or name in dropped_names:
continue
spec = columns.get(name)
if spec is None:
continue
pg_type = spec.postgres_type.upper()
if pg_type not in _PARTITION_FRIENDLY_TYPES:
continue
if cs.distinct_overflow:
continue
if not (
partition_min_distinct <= cs.distinct_count <= partition_max_distinct
):
continue
if cs.fill_pct < partition_min_fill_pct:
continue
name_match = _matches_any(PARTITION_NAME_PATTERNS, name)
# Score: name-match dominates, then prefer fewer partitions (safer
# DDL), then prefer more-filled columns as a tiebreaker.
score = (
(500_000.0 if name_match else 0.0)
+ (partition_max_distinct - cs.distinct_count)
+ cs.fill_pct
)
notes: List[str] = []
if name_match:
notes.append("name matches PARTITION_NAME_PATTERNS")
if cs.distinct_count > 10_000:
notes.append(
f"distinct_count={cs.distinct_count:,} exceeds loader "
"max_partitions default (10,000); expect DDL warnings"
)
notes.append(
"LIST partitioning creates one child table per distinct value "
"(see load_sas.render_partition_ddl)"
)
partitions.append(
_PartitionCandidate(
name=name,
kind="observed",
distinct_count=cs.distinct_count,
fill_pct=cs.fill_pct,
top_values=_format_top_values(cs.top_values()),
observed_values_in_file=_format_top_values(cs.top_values()),
note="; ".join(notes),
score=score,
)
)
partitions.sort(key=lambda p: p.score, reverse=True)
partition_names = {p.name for p in partitions} partition_names = {p.name for p in partitions}
# -- Index candidates -------------------------------------------------- # -- Index candidates --------------------------------------------------
@ -768,17 +707,11 @@ def render_yaml_snippet(
) )
lines.append("partition_by:") lines.append("partition_by:")
lines.append(f" - {top.name}") lines.append(f" - {top.name}")
if len(partitions) > 1:
lines.append(
"# Runners-up (append to partition_by for multi-level "
"LIST partitioning; see load_sas.render_partition_ddl):"
)
for p in partitions[1:]:
lines.append(
f"# - {p.name} # kind={p.kind} distinct={p.distinct_count}"
)
else: else:
lines.append("# (no partition candidates found)") lines.append(
"# (no partition candidates found - no column matched "
"PARTITION_NAME_PATTERNS)"
)
lines.append("") lines.append("")
@ -1050,8 +983,6 @@ def _build_argparser() -> argparse.ArgumentParser:
help="Null percentage at/above which a column is a drop candidate.") help="Null percentage at/above which a column is a drop candidate.")
p.add_argument("--index-uniqueness-pct", type=float, default=INDEX_UNIQUENESS_PCT, p.add_argument("--index-uniqueness-pct", type=float, default=INDEX_UNIQUENESS_PCT,
help="Uniqueness (distinct/non-null) at/above which a column is an index candidate.") help="Uniqueness (distinct/non-null) at/above which a column is an index candidate.")
p.add_argument("--partition-min-distinct", type=int, default=PARTITION_MIN_DISTINCT)
p.add_argument("--partition-max-distinct", type=int, default=PARTITION_MAX_DISTINCT)
p.add_argument("--partition-min-fill-pct", type=float, default=PARTITION_MIN_FILL_PCT) p.add_argument("--partition-min-fill-pct", type=float, default=PARTITION_MIN_FILL_PCT)
p.add_argument("--pre-sharded-max-distinct", type=int, default=PRE_SHARDED_MAX_DISTINCT) p.add_argument("--pre-sharded-max-distinct", type=int, default=PRE_SHARDED_MAX_DISTINCT)
return p return p
@ -1074,8 +1005,6 @@ def main(argv: Optional[List[str]] = None) -> int:
stats, columns, stats, columns,
high_null_pct=args.high_null_pct, high_null_pct=args.high_null_pct,
index_uniqueness_pct=args.index_uniqueness_pct, index_uniqueness_pct=args.index_uniqueness_pct,
partition_min_distinct=args.partition_min_distinct,
partition_max_distinct=args.partition_max_distinct,
partition_min_fill_pct=args.partition_min_fill_pct, partition_min_fill_pct=args.partition_min_fill_pct,
pre_sharded_max_distinct=args.pre_sharded_max_distinct, pre_sharded_max_distinct=args.pre_sharded_max_distinct,
) )
@ -1085,10 +1014,9 @@ def main(argv: Optional[List[str]] = None) -> int:
thresholds = { thresholds = {
"HIGH_NULL_PCT": args.high_null_pct, "HIGH_NULL_PCT": args.high_null_pct,
"INDEX_UNIQUENESS_PCT": args.index_uniqueness_pct, "INDEX_UNIQUENESS_PCT": args.index_uniqueness_pct,
"PARTITION_MIN_DISTINCT": args.partition_min_distinct,
"PARTITION_MAX_DISTINCT": args.partition_max_distinct,
"PARTITION_MIN_FILL_PCT": args.partition_min_fill_pct, "PARTITION_MIN_FILL_PCT": args.partition_min_fill_pct,
"PRE_SHARDED_MAX_DISTINCT": args.pre_sharded_max_distinct, "PRE_SHARDED_MAX_DISTINCT": args.pre_sharded_max_distinct,
"PARTITION_NAME_PATTERNS": ", ".join(p.pattern for p in PARTITION_NAME_PATTERNS),
"DISTINCT_CAP": DISTINCT_CAP, "DISTINCT_CAP": DISTINCT_CAP,
"TOP_N_VALUES": TOP_N_VALUES, "TOP_N_VALUES": TOP_N_VALUES,
"PREVIEW_ROWS_FOR_INFERENCE": PREVIEW_ROWS_FOR_INFERENCE, "PREVIEW_ROWS_FOR_INFERENCE": PREVIEW_ROWS_FOR_INFERENCE,