2026-04-21 22:32:18 +00:00
1 changed files with 62 additions and 134 deletions
--- a/utils/sas_profiler.py
+++ b/utils/sas_profiler.py
@ -76,21 +76,14 @@ INDEX_UNIQUENESS_PCT: float = 95.0
 """Columns whose distinct/non-null ratio meets or exceeds this threshold are
 flagged as index candidates."""

-PARTITION_MIN_DISTINCT: int = 2
-"""A partition candidate must have at least this many distinct values."""
-
-PARTITION_MAX_DISTINCT: int = 500
-"""A partition candidate must have at most this many distinct values. Kept
-deliberately tighter than the loader's max_partitions default (10,000) so
-the default suggestions stay conservative."""
-
 PARTITION_MIN_FILL_PCT: float = 95.0
-"""Partition candidates must be non-null in at least this fraction of rows."""
+"""Name-matched partition candidates must be non-null in at least this
+fraction of rows."""

 PRE_SHARDED_MAX_DISTINCT: int = 3
 """A name-matched column with <= this many distinct values is treated as
-"the file is probably pre-sharded on this column" rather than being
-silently dumped into the drop list."""
+pre-sharded ("this file is one slice; sibling files have the other values")
+rather than as a ready-to-partition observed column."""

 DISTINCT_CAP: int = 10_000
 """Max size of the per-column distinct-value set. Exceeding this marks the
@ -108,10 +101,11 @@ PARTITION_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
    re.compile(r"^state$", re.IGNORECASE),
    re.compile(r"^state_?code$", re.IGNORECASE),
 )
-"""Column names that are "probably partition columns" regardless of how
-many distinct values happen to be present in this one file. Kept tiny on
-purpose - add more patterns here later if you want to recognise
-region/year/etc."""
+"""Only columns whose name matches one of these patterns are ever considered
+partition candidates. This deliberately ignores generic low-cardinality
+signals (status flags, boolean columns, etc.) because in practice the only
+useful partition key in this codebase is STATE. Add more patterns here if
+that ever stops being true."""


 INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
@ -123,12 +117,6 @@ INDEX_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
 """Name-bonus patterns for index-candidate ranking."""


-_PARTITION_FRIENDLY_TYPES: frozenset = frozenset(
-    {"TEXT", "VARCHAR", "CHARACTER VARYING", "CHAR", "CHARACTER",
-     "INTEGER", "BIGINT", "SMALLINT", "BOOLEAN", "DATE"}
-)
-
-
 # ---------------------------------------------------------------------------
 # Per-column streaming aggregator
 # ---------------------------------------------------------------------------
@ -447,8 +435,6 @@ def classify(
    *,
    high_null_pct: float,
    index_uniqueness_pct: float,
-    partition_min_distinct: int,
-    partition_max_distinct: int,
    partition_min_fill_pct: float,
    pre_sharded_max_distinct: int,
 ) -> Tuple[
@ -457,60 +443,69 @@ def classify(
    List[_IndexCandidate],
    List[_TypeWarning],
 ]:
-    """Turn per-column stats + the loader's schema into four ranked lists."""
+    """Turn per-column stats + the loader's schema into four ranked lists.
+
+    Partition candidates are restricted to columns whose name matches
+    :data:`PARTITION_NAME_PATTERNS` - in practice STATE / STATE_CODE. A
+    generic "low-cardinality = partition candidate" heuristic produces too
+    much noise for this codebase, so we only surface columns we're confident
+    about by name.
+    """

    drops: List[_DropCandidate] = []
    partitions: List[_PartitionCandidate] = []
    indexes: List[_IndexCandidate] = []
    warnings: List[_TypeWarning] = []

-    # Names we've already routed into the partition lane - exclude them from
-    # the drop / index lanes downstream.
-    claimed_by_partition: set = set()
-
-    # -- First pass: partition-name-matched columns ------------------------
+    # -- Partition candidates (name-matched only) --------------------------
    # Run this before the drop check so pre-sharded STATE columns don't get
-    # silently dropped.
+    # silently dropped for being "constant".
+    claimed_by_partition: set = set()
    for name, cs in stats.items():
-        spec = columns.get(name)
        if not _matches_any(PARTITION_NAME_PATTERNS, name):
            continue
-        if cs.n_total == 0:
+        if cs.n_total == 0 or cs.n_non_null == 0:
+            continue
+        if cs.fill_pct < partition_min_fill_pct:
            continue

-        looks_pre_sharded = (
-            cs.n_non_null > 0
-            and not cs.distinct_overflow
+        is_pre_sharded = (
+            not cs.distinct_overflow
            and cs.distinct_count <= pre_sharded_max_distinct
-            and cs.fill_pct >= partition_min_fill_pct
        )
-        if looks_pre_sharded:
-            observed = ", ".join(_format_value(v) for v, _ in cs.top_values(pre_sharded_max_distinct))
-            note_parts = [
-                f"pre-sharded: this file only contains {cs.distinct_count} distinct "
-                f"value(s) ({observed})",
-                "keep the column and set partition_by at the load_folder level so "
-                "sibling files merge into separate partitions of one table",
-            ]
-            partitions.append(
-                _PartitionCandidate(
-                    name=name,
-                    kind="pre_sharded",
-                    distinct_count=cs.distinct_count,
-                    fill_pct=cs.fill_pct,
-                    top_values=_format_top_values(cs.top_values()),
-                    observed_values_in_file=observed,
-                    note="; ".join(note_parts),
-                    # Pre-sharded STATE always wins the ranking.
-                    score=1_000_000.0,
-                )
-            )
-            claimed_by_partition.add(name)
-            continue
+        kind = "pre_sharded" if is_pre_sharded else "observed"
+        observed = _format_top_values(cs.top_values(pre_sharded_max_distinct))

-        # Name-matched but not pre-sharded: fall through into the regular
-        # partition candidate pass below, which will score it up due to the
-        # name match.
+        if is_pre_sharded:
+            note = (
+                f"pre-sharded: this file only contains {cs.distinct_count} "
+                f"distinct value(s) ({observed}); keep the column and set "
+                "partition_by at the load_folder level so sibling files merge "
+                "into separate partitions of one table"
+            )
+        else:
+            note = (
+                f"observed {cs.distinct_display} distinct value(s) across "
+                f"{cs.fill_pct:.1f}% of rows; LIST partitioning will create "
+                "one child table per distinct value"
+            )
+
+        partitions.append(
+            _PartitionCandidate(
+                name=name,
+                kind=kind,
+                distinct_count=cs.distinct_count,
+                fill_pct=cs.fill_pct,
+                top_values=_format_top_values(cs.top_values()),
+                observed_values_in_file=observed,
+                note=note,
+                # Pre-sharded beats observed as the snippet's top pick.
+                score=(1_000_000.0 if is_pre_sharded else 500_000.0) + cs.fill_pct,
+            )
+        )
+        claimed_by_partition.add(name)
+
+    partitions.sort(key=lambda p: p.score, reverse=True)

    # -- Drop candidates ---------------------------------------------------
    for name, cs in stats.items():
@ -541,62 +536,6 @@ def classify(
            drops.append(_DropCandidate(name=name, reason=reason))

    dropped_names = {d.name for d in drops}
-
-    # -- Partition candidates (observed) ----------------------------------
-    for name, cs in stats.items():
-        if name in claimed_by_partition or name in dropped_names:
-            continue
-        spec = columns.get(name)
-        if spec is None:
-            continue
-        pg_type = spec.postgres_type.upper()
-        if pg_type not in _PARTITION_FRIENDLY_TYPES:
-            continue
-        if cs.distinct_overflow:
-            continue
-        if not (
-            partition_min_distinct <= cs.distinct_count <= partition_max_distinct
-        ):
-            continue
-        if cs.fill_pct < partition_min_fill_pct:
-            continue
-
-        name_match = _matches_any(PARTITION_NAME_PATTERNS, name)
-        # Score: name-match dominates, then prefer fewer partitions (safer
-        # DDL), then prefer more-filled columns as a tiebreaker.
-        score = (
-            (500_000.0 if name_match else 0.0)
-            + (partition_max_distinct - cs.distinct_count)
-            + cs.fill_pct
-        )
-
-        notes: List[str] = []
-        if name_match:
-            notes.append("name matches PARTITION_NAME_PATTERNS")
-        if cs.distinct_count > 10_000:
-            notes.append(
-                f"distinct_count={cs.distinct_count:,} exceeds loader "
-                "max_partitions default (10,000); expect DDL warnings"
-            )
-        notes.append(
-            "LIST partitioning creates one child table per distinct value "
-            "(see load_sas.render_partition_ddl)"
-        )
-
-        partitions.append(
-            _PartitionCandidate(
-                name=name,
-                kind="observed",
-                distinct_count=cs.distinct_count,
-                fill_pct=cs.fill_pct,
-                top_values=_format_top_values(cs.top_values()),
-                observed_values_in_file=_format_top_values(cs.top_values()),
-                note="; ".join(notes),
-                score=score,
-            )
-        )
-
-    partitions.sort(key=lambda p: p.score, reverse=True)
    partition_names = {p.name for p in partitions}

    # -- Index candidates --------------------------------------------------
@ -768,17 +707,11 @@ def render_yaml_snippet(
            )
        lines.append("partition_by:")
        lines.append(f"  - {top.name}")
-        if len(partitions) > 1:
-            lines.append(
-                "# Runners-up (append to partition_by for multi-level "
-                "LIST partitioning; see load_sas.render_partition_ddl):"
-            )
-            for p in partitions[1:]:
-                lines.append(
-                    f"#   - {p.name}  # kind={p.kind} distinct={p.distinct_count}"
-                )
    else:
-        lines.append("# (no partition candidates found)")
+        lines.append(
+            "# (no partition candidates found - no column matched "
+            "PARTITION_NAME_PATTERNS)"
+        )

    lines.append("")

@ -1050,8 +983,6 @@ def _build_argparser() -> argparse.ArgumentParser:
                   help="Null percentage at/above which a column is a drop candidate.")
    p.add_argument("--index-uniqueness-pct", type=float, default=INDEX_UNIQUENESS_PCT,
                   help="Uniqueness (distinct/non-null) at/above which a column is an index candidate.")
-    p.add_argument("--partition-min-distinct", type=int, default=PARTITION_MIN_DISTINCT)
-    p.add_argument("--partition-max-distinct", type=int, default=PARTITION_MAX_DISTINCT)
    p.add_argument("--partition-min-fill-pct", type=float, default=PARTITION_MIN_FILL_PCT)
    p.add_argument("--pre-sharded-max-distinct", type=int, default=PRE_SHARDED_MAX_DISTINCT)
    return p
@ -1074,8 +1005,6 @@ def main(argv: Optional[List[str]] = None) -> int:
        stats, columns,
        high_null_pct=args.high_null_pct,
        index_uniqueness_pct=args.index_uniqueness_pct,
-        partition_min_distinct=args.partition_min_distinct,
-        partition_max_distinct=args.partition_max_distinct,
        partition_min_fill_pct=args.partition_min_fill_pct,
        pre_sharded_max_distinct=args.pre_sharded_max_distinct,
    )
@ -1085,10 +1014,9 @@ def main(argv: Optional[List[str]] = None) -> int:
    thresholds = {
        "HIGH_NULL_PCT": args.high_null_pct,
        "INDEX_UNIQUENESS_PCT": args.index_uniqueness_pct,
-        "PARTITION_MIN_DISTINCT": args.partition_min_distinct,
-        "PARTITION_MAX_DISTINCT": args.partition_max_distinct,
        "PARTITION_MIN_FILL_PCT": args.partition_min_fill_pct,
        "PRE_SHARDED_MAX_DISTINCT": args.pre_sharded_max_distinct,
+        "PARTITION_NAME_PATTERNS": ", ".join(p.pattern for p in PARTITION_NAME_PATTERNS),
        "DISTINCT_CAP": DISTINCT_CAP,
        "TOP_N_VALUES": TOP_N_VALUES,
        "PREVIEW_ROWS_FOR_INFERENCE": PREVIEW_ROWS_FOR_INFERENCE,