advanced_analyzer #8

Merged
dp merged 23 commits from advanced_analyzer into main 2026-04-21 22:32:18 +00:00
Showing only changes of commit 5449a25b44 - Show all commits

View File

@ -76,21 +76,14 @@ INDEX_UNIQUENESS_PCT: float = 95.0
"""Columns whose distinct/non-null ratio meets or exceeds this threshold are
flagged as index candidates."""
PARTITION_MIN_DISTINCT: int = 2
"""A partition candidate must have at least this many distinct values."""
PARTITION_MAX_DISTINCT: int = 500
"""A partition candidate must have at most this many distinct values. Kept
deliberately tighter than the loader's max_partitions default (10,000) so
the default suggestions stay conservative."""
PARTITION_MIN_FILL_PCT: float = 95.0
"""Partition candidates must be non-null in at least this fraction of rows."""
"""Name-matched partition candidates must be non-null in at least this
fraction of rows."""
PRE_SHARDED_MAX_DISTINCT: int = 3
"""A name-matched column with <= this many distinct values is treated as
"the file is probably pre-sharded on this column" rather than being
silently dumped into the drop list."""
pre-sharded ("this file is one slice; sibling files have the other values")
rather than as a ready-to-partition observed column."""
DISTINCT_CAP: int = 10_000
"""Max size of the per-column distinct-value set. Exceeding this marks the
@ -108,10 +101,11 @@ PARTITION_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
re.compile(r"^state$", re.IGNORECASE),
re.compile(r"^state_?code$", re.IGNORECASE),
)
"""Column names that are "probably partition columns" regardless of how
many distinct values happen to be present in this one file. Kept tiny on
purpose - add more patterns here later if you want to recognise
region/year/etc."""
"""Only columns whose name matches one of these patterns are ever considered
partition candidates. This deliberately ignores generic low-cardinality
signals (status flags, boolean columns, etc.) because in practice the only
useful partition key in this codebase is STATE. Add more patterns here if
that ever stops being true."""
INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
@ -123,12 +117,6 @@ INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
"""Name-bonus patterns for index-candidate ranking."""
_PARTITION_FRIENDLY_TYPES: frozenset = frozenset(
{"TEXT", "VARCHAR", "CHARACTER VARYING", "CHAR", "CHARACTER",
"INTEGER", "BIGINT", "SMALLINT", "BOOLEAN", "DATE"}
)
# ---------------------------------------------------------------------------
# Per-column streaming aggregator
# ---------------------------------------------------------------------------
@ -447,8 +435,6 @@ def classify(
*,
high_null_pct: float,
index_uniqueness_pct: float,
partition_min_distinct: int,
partition_max_distinct: int,
partition_min_fill_pct: float,
pre_sharded_max_distinct: int,
) -> Tuple[
@ -457,60 +443,69 @@ def classify(
List[_IndexCandidate],
List[_TypeWarning],
]:
"""Turn per-column stats + the loader's schema into four ranked lists."""
"""Turn per-column stats + the loader's schema into four ranked lists.
Partition candidates are restricted to columns whose name matches
:data:`PARTITION_NAME_PATTERNS` - in practice STATE / STATE_CODE. A
generic "low-cardinality = partition candidate" heuristic produces too
much noise for this codebase, so we only surface columns we're confident
about by name.
"""
drops: List[_DropCandidate] = []
partitions: List[_PartitionCandidate] = []
indexes: List[_IndexCandidate] = []
warnings: List[_TypeWarning] = []
# Names we've already routed into the partition lane - exclude them from
# the drop / index lanes downstream.
claimed_by_partition: set = set()
# -- First pass: partition-name-matched columns ------------------------
# -- Partition candidates (name-matched only) --------------------------
# Run this before the drop check so pre-sharded STATE columns don't get
# silently dropped.
# silently dropped for being "constant".
claimed_by_partition: set = set()
for name, cs in stats.items():
spec = columns.get(name)
if not _matches_any(PARTITION_NAME_PATTERNS, name):
continue
if cs.n_total == 0:
if cs.n_total == 0 or cs.n_non_null == 0:
continue
if cs.fill_pct < partition_min_fill_pct:
continue
looks_pre_sharded = (
cs.n_non_null > 0
and not cs.distinct_overflow
is_pre_sharded = (
not cs.distinct_overflow
and cs.distinct_count <= pre_sharded_max_distinct
and cs.fill_pct >= partition_min_fill_pct
)
if looks_pre_sharded:
observed = ", ".join(_format_value(v) for v, _ in cs.top_values(pre_sharded_max_distinct))
note_parts = [
f"pre-sharded: this file only contains {cs.distinct_count} distinct "
f"value(s) ({observed})",
"keep the column and set partition_by at the load_folder level so "
"sibling files merge into separate partitions of one table",
]
partitions.append(
_PartitionCandidate(
name=name,
kind="pre_sharded",
distinct_count=cs.distinct_count,
fill_pct=cs.fill_pct,
top_values=_format_top_values(cs.top_values()),
observed_values_in_file=observed,
note="; ".join(note_parts),
# Pre-sharded STATE always wins the ranking.
score=1_000_000.0,
)
)
claimed_by_partition.add(name)
continue
kind = "pre_sharded" if is_pre_sharded else "observed"
observed = _format_top_values(cs.top_values(pre_sharded_max_distinct))
# Name-matched but not pre-sharded: fall through into the regular
# partition candidate pass below, which will score it up due to the
# name match.
if is_pre_sharded:
note = (
f"pre-sharded: this file only contains {cs.distinct_count} "
f"distinct value(s) ({observed}); keep the column and set "
"partition_by at the load_folder level so sibling files merge "
"into separate partitions of one table"
)
else:
note = (
f"observed {cs.distinct_display} distinct value(s) across "
f"{cs.fill_pct:.1f}% of rows; LIST partitioning will create "
"one child table per distinct value"
)
partitions.append(
_PartitionCandidate(
name=name,
kind=kind,
distinct_count=cs.distinct_count,
fill_pct=cs.fill_pct,
top_values=_format_top_values(cs.top_values()),
observed_values_in_file=observed,
note=note,
# Pre-sharded beats observed as the snippet's top pick.
score=(1_000_000.0 if is_pre_sharded else 500_000.0) + cs.fill_pct,
)
)
claimed_by_partition.add(name)
partitions.sort(key=lambda p: p.score, reverse=True)
# -- Drop candidates ---------------------------------------------------
for name, cs in stats.items():
@ -541,62 +536,6 @@ def classify(
drops.append(_DropCandidate(name=name, reason=reason))
dropped_names = {d.name for d in drops}
# -- Partition candidates (observed) ----------------------------------
for name, cs in stats.items():
if name in claimed_by_partition or name in dropped_names:
continue
spec = columns.get(name)
if spec is None:
continue
pg_type = spec.postgres_type.upper()
if pg_type not in _PARTITION_FRIENDLY_TYPES:
continue
if cs.distinct_overflow:
continue
if not (
partition_min_distinct <= cs.distinct_count <= partition_max_distinct
):
continue
if cs.fill_pct < partition_min_fill_pct:
continue
name_match = _matches_any(PARTITION_NAME_PATTERNS, name)
# Score: name-match dominates, then prefer fewer partitions (safer
# DDL), then prefer more-filled columns as a tiebreaker.
score = (
(500_000.0 if name_match else 0.0)
+ (partition_max_distinct - cs.distinct_count)
+ cs.fill_pct
)
notes: List[str] = []
if name_match:
notes.append("name matches PARTITION_NAME_PATTERNS")
if cs.distinct_count > 10_000:
notes.append(
f"distinct_count={cs.distinct_count:,} exceeds loader "
"max_partitions default (10,000); expect DDL warnings"
)
notes.append(
"LIST partitioning creates one child table per distinct value "
"(see load_sas.render_partition_ddl)"
)
partitions.append(
_PartitionCandidate(
name=name,
kind="observed",
distinct_count=cs.distinct_count,
fill_pct=cs.fill_pct,
top_values=_format_top_values(cs.top_values()),
observed_values_in_file=_format_top_values(cs.top_values()),
note="; ".join(notes),
score=score,
)
)
partitions.sort(key=lambda p: p.score, reverse=True)
partition_names = {p.name for p in partitions}
# -- Index candidates --------------------------------------------------
@ -768,17 +707,11 @@ def render_yaml_snippet(
)
lines.append("partition_by:")
lines.append(f" - {top.name}")
if len(partitions) > 1:
lines.append(
"# Runners-up (append to partition_by for multi-level "
"LIST partitioning; see load_sas.render_partition_ddl):"
)
for p in partitions[1:]:
lines.append(
f"# - {p.name} # kind={p.kind} distinct={p.distinct_count}"
)
else:
lines.append("# (no partition candidates found)")
lines.append(
"# (no partition candidates found - no column matched "
"PARTITION_NAME_PATTERNS)"
)
lines.append("")
@ -1050,8 +983,6 @@ def _build_argparser() -> argparse.ArgumentParser:
help="Null percentage at/above which a column is a drop candidate.")
p.add_argument("--index-uniqueness-pct", type=float, default=INDEX_UNIQUENESS_PCT,
help="Uniqueness (distinct/non-null) at/above which a column is an index candidate.")
p.add_argument("--partition-min-distinct", type=int, default=PARTITION_MIN_DISTINCT)
p.add_argument("--partition-max-distinct", type=int, default=PARTITION_MAX_DISTINCT)
p.add_argument("--partition-min-fill-pct", type=float, default=PARTITION_MIN_FILL_PCT)
p.add_argument("--pre-sharded-max-distinct", type=int, default=PRE_SHARDED_MAX_DISTINCT)
return p
@ -1074,8 +1005,6 @@ def main(argv: Optional[List[str]] = None) -> int:
stats, columns,
high_null_pct=args.high_null_pct,
index_uniqueness_pct=args.index_uniqueness_pct,
partition_min_distinct=args.partition_min_distinct,
partition_max_distinct=args.partition_max_distinct,
partition_min_fill_pct=args.partition_min_fill_pct,
pre_sharded_max_distinct=args.pre_sharded_max_distinct,
)
@ -1085,10 +1014,9 @@ def main(argv: Optional[List[str]] = None) -> int:
thresholds = {
"HIGH_NULL_PCT": args.high_null_pct,
"INDEX_UNIQUENESS_PCT": args.index_uniqueness_pct,
"PARTITION_MIN_DISTINCT": args.partition_min_distinct,
"PARTITION_MAX_DISTINCT": args.partition_max_distinct,
"PARTITION_MIN_FILL_PCT": args.partition_min_fill_pct,
"PRE_SHARDED_MAX_DISTINCT": args.pre_sharded_max_distinct,
"PARTITION_NAME_PATTERNS": ", ".join(p.pattern for p in PARTITION_NAME_PATTERNS),
"DISTINCT_CAP": DISTINCT_CAP,
"TOP_N_VALUES": TOP_N_VALUES,
"PREVIEW_ROWS_FOR_INFERENCE": PREVIEW_ROWS_FOR_INFERENCE,