From a94ab68f4da8dc079304710bd087af70936e63fe Mon Sep 17 00:00:00 2001
From: David Peterson
Date: Mon, 20 Apr 2026 19:27:01 -0500
Subject: [PATCH] Refine partition name patterns in sas_profiler.py

Updated the regular expression for partition name patterns to improve
matching accuracy for state-related columns. The new pattern captures
variations like `state`, `state_code`, and `statecode` while avoiding
false positives from unrelated terms. This change enhances the precision
of partition candidate selection.
---
 utils/sas_profiler.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/utils/sas_profiler.py b/utils/sas_profiler.py
index 2d70964..3adc4d6 100644
--- a/utils/sas_profiler.py
+++ b/utils/sas_profiler.py
@@ -117,8 +117,12 @@ larger than the file, pyreadstat just hands back one chunk."""
 
 
 PARTITION_NAME_PATTERNS: Tuple[re.Pattern, ...] = (
-    re.compile(r"^state$", re.IGNORECASE),
-    re.compile(r"^state_?code$", re.IGNORECASE),
+    # ``state`` or ``state_code`` / ``statecode`` appearing as a full token
+    # anywhere in the column name. Uses underscore / start / end as token
+    # boundaries so we catch STATE, STATE_CODE, HOME_STATE,
+    # ADDR_LINE3_STATE, BIRTH_STATE_CODE, etc. without matching STATUS,
+    # ESTATE, INTERSTATE, or STATEWIDE.
+    re.compile(r"(?:^|_)state(?:_?code)?(?:_|$)", re.IGNORECASE),
 )
 """Only columns whose name matches one of these patterns are ever considered
 partition candidates. This deliberately ignores generic low-cardinality