Enhance file clustering by implementing numeric sorting for last digit groups in stems and updating documentation for embedded-digit handling in auto-detection. #5

Merged
dp merged 1 commit from regex_fix into main 2026-04-20 16:51:00 +00:00
2 changed files with 59 additions and 4 deletions

View File

@ -90,6 +90,25 @@ Exit codes:
* Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with * Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with
any trailing ``_`` / ``-`` stripped afterward. Stems without trailing any trailing ``_`` / ``-`` stripped afterward. Stems without trailing
digits become singleton clusters named after the stem. digits become singleton clusters named after the stem.
* Within a cluster, files are sorted **numerically** by the last digit
group in the stem, so ``..._9_...`` comes before ``..._10_...`` /
``..._40_...`` regardless of zero-padding. The first file in that
order drives schema inference; the rest are checked against that
schema via :func:`load_sas.assert_schema_compatible`. Gaps in the
numeric sequence (missing ``3``, ``7``, ``14``, ...) are irrelevant -
whatever files are present get loaded in numeric order.
* Auto-detect only recognizes *trailing* digit runs. File names where
the varying number sits in the middle of the stem (surrounded by
other name components) are not grouped by auto-detect - each becomes
its own singleton cluster. Use an explicit pattern to bucket them::
clusters:
- pattern: '^year2020_regionA_\\d+_detail\\.sas7bdat$'
tablename: year2020_regionA_detail
The regex still matches any digit width, so numbers like ``9`` and
``40`` both land in the same cluster and the numeric sort above puts
``9`` before ``40``.
4. Library usage 4. Library usage
---------------- ----------------
@ -487,6 +506,7 @@ def load_folder_config(path: Path) -> FolderConfig:
_TRAILING_DIGIT_RE = re.compile(r"\d+$") _TRAILING_DIGIT_RE = re.compile(r"\d+$")
# Matches each run of consecutive digits anywhere in a string; the cluster
# sort key uses it to pick out the LAST digit group of a file stem.
_DIGIT_GROUP_RE = re.compile(r"\d+")
def _auto_prefix(stem: str) -> str: def _auto_prefix(stem: str) -> str:
@ -501,6 +521,22 @@ def _auto_prefix(stem: str) -> str:
return stripped or stem return stripped or stem
def _cluster_sort_key(path: Path) -> Tuple[int, str]:
    """Return the ordering key for one file inside a cluster.

    The key is ``(n, stem)``, where ``n`` is the integer value of the LAST
    run of digits in the file stem. Comparing integers instead of text puts
    ``_9`` ahead of ``_10`` / ``_40`` whatever the digit width, and likewise
    ``foo_9_detail`` ahead of ``foo_40_detail``. The file that sorts first
    is the one whose schema is inferred for the target table, so a numeric
    order keeps that pick stable as more files are added to the cluster.

    A stem with no digits at all gets ``n = -1`` and therefore sorts ahead
    of every numbered file. The stem is the second element of the key so
    that ties on ``n`` are broken reproducibly.
    """
    stem = path.stem
    digit_runs = _DIGIT_GROUP_RE.findall(stem)
    if not digit_runs:
        # No number to order by: place the file before all numbered ones.
        return (-1, stem)
    # Only the final digit group matters; earlier groups (e.g. a year
    # embedded in the name) are ignored.
    return (int(digit_runs[-1]), stem)
def _list_sas_files(folder: Path) -> List[Path]: def _list_sas_files(folder: Path) -> List[Path]:
files: List[Path] = [] files: List[Path] = []
for p in sorted(folder.iterdir()): for p in sorted(folder.iterdir()):
@ -582,7 +618,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
clusters.append( clusters.append(
ClusterSpec( ClusterSpec(
tablename=patt.tablename, tablename=patt.tablename,
files=sorted(matched), files=sorted(matched, key=_cluster_sort_key),
if_exists=patt.if_exists or cfg.if_exists, if_exists=patt.if_exists or cfg.if_exists,
include=patt.include if patt.include is not None else cfg.include, include=patt.include if patt.include is not None else cfg.include,
exclude=patt.exclude if patt.exclude is not None else cfg.exclude, exclude=patt.exclude if patt.exclude is not None else cfg.exclude,
@ -603,7 +639,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
clusters.append( clusters.append(
ClusterSpec( ClusterSpec(
tablename=key, tablename=key,
files=sorted(buckets[key]), files=sorted(buckets[key], key=_cluster_sort_key),
if_exists=cfg.if_exists, if_exists=cfg.if_exists,
include=cfg.include, include=cfg.include,
exclude=cfg.exclude, exclude=cfg.exclude,

View File

@ -19,6 +19,12 @@ if_exists: replace
# auto-grouped with its peers by stripping trailing digits (and any trailing # auto-grouped with its peers by stripping trailing digits (and any trailing
# _ / -) from the file stem. Files with no trailing digits become their own # _ / -) from the file stem. Files with no trailing digits become their own
# singleton cluster. # singleton cluster.
#
# Auto-detect only recognizes *trailing* digit runs. If your file names put
# the varying number in the middle of the stem (e.g. surrounded by year,
# region, and detail components), auto-detect will NOT group them - each
# file becomes its own singleton cluster. Use an explicit pattern instead;
# see the embedded-digit example near the bottom of this file.
auto_detect: true auto_detect: true
# Folder-level column filter. Every file in every cluster passes through # Folder-level column filter. Every file in every cluster passes through
@ -92,7 +98,20 @@ clusters:
# - region # - region
# - year # - year
# With only the gq pattern explicit, auto_detect: true will still bucket # Embedded-digit example. When the varying number sits in the MIDDLE of
# group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone # the stem (e.g. year2020_regionA_40_detail.sas7bdat,
# year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
# them - each file becomes its own singleton cluster. An explicit
# pattern buckets them correctly. The \d+ matches any width, and
# files within the cluster are sorted numerically by the last digit
# group in the stem, so _9_ sorts before _40_ regardless of zero-
# padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
# fine - whatever files are present get loaded in numeric order.
#
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
# tablename: year2020_regionA_detail
# With only the group_a pattern explicit, auto_detect: true will still
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py # standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
# for the fixture that exercises exactly this layout. # for the fixture that exercises exactly this layout.