Enhance file clustering by implementing numeric sorting for last digit groups in stems and updating documentation for embedded-digit handling in auto-detection.

2026-04-20 11:48:22 -05:00 · 2026-04-20 11:48:22 -05:00 · b78f6d648f
commit b78f6d648f
parent b3d7a9d440
2 changed files with 59 additions and 4 deletions
--- a/generic_loader/load_folder.py
+++ b/generic_loader/load_folder.py
@@ -90,6 +90,25 @@ Exit codes:
 * Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with
  any trailing ``_`` / ``-`` stripped afterward. Stems without trailing
  digits become singleton clusters named after the stem.
 * Within a cluster, files are sorted **numerically** by the last digit
  group in the stem, so ``..._9_...`` comes before ``..._10_...`` /
  ``..._40_...`` regardless of zero-padding. The first file in that
  order drives schema inference; the rest are checked against that
  schema via :func:`load_sas.assert_schema_compatible`. Gaps in the
  numeric sequence (missing ``3``, ``7``, ``14``, ...) are irrelevant -
  whatever files are present get loaded in numeric order.
 * Auto-detect only recognizes *trailing* digit runs. File names where
  the varying number sits in the middle of the stem (surrounded by
  other name components) are not grouped by auto-detect - each becomes
  its own singleton cluster. Use an explicit pattern to bucket them::
      clusters:
        - pattern: '^year2020_regionA_\\d+_detail\\.sas7bdat$'
          tablename: year2020_regionA_detail
  The ``\\d+`` in the pattern matches a digit run of any width, so
  numbers like ``9`` and ``40`` both land in the same cluster and the
  numeric sort above puts ``9`` before ``40``.
 4. Library usage
 ----------------
@@ -487,6 +506,7 @@ def load_folder_config(path: Path) -> FolderConfig:
 _TRAILING_DIGIT_RE = re.compile(r"\d+$")
 _DIGIT_GROUP_RE = re.compile(r"\d+")
 def _auto_prefix(stem: str) -> str:
@@ -501,6 +521,22 @@ def _auto_prefix(stem: str) -> str:
    return stripped or stem
 def _cluster_sort_key(path: Path) -> Tuple[int, str]:
    """Build the within-cluster ordering key for ``path``.

    The key is ``(number, stem)``. ``number`` is the integer value of the
    LAST run of digits found in ``path.stem``, so ``_9`` precedes ``_10`` /
    ``_40`` whatever the digit width, and ``foo_9_detail`` precedes
    ``foo_40_detail``. A stem with no digits at all gets ``-1``, which
    places it ahead of every numbered file. ``stem`` is the second element
    so that files sharing the same number still order reproducibly.

    The first file under this ordering is the one whose schema is inferred
    and used to create the target table.
    """
    stem = path.stem
    digit_runs = _DIGIT_GROUP_RE.findall(stem)
    if not digit_runs:
        # No digits anywhere in the stem: sort ahead of all numbered files.
        return (-1, stem)
    # int() ignores leading zeros, so the comparison is truly numeric.
    return (int(digit_runs[-1]), stem)
 def _list_sas_files(folder: Path) -> List[Path]:
    files: List[Path] = []
    for p in sorted(folder.iterdir()):
@@ -582,7 +618,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
        clusters.append(
            ClusterSpec(
                tablename=patt.tablename,
-                files=sorted(matched),
+                files=sorted(matched, key=_cluster_sort_key),
                if_exists=patt.if_exists or cfg.if_exists,
                include=patt.include if patt.include is not None else cfg.include,
                exclude=patt.exclude if patt.exclude is not None else cfg.exclude,
@@ -603,7 +639,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
            clusters.append(
                ClusterSpec(
                    tablename=key,
-                    files=sorted(buckets[key]),
+                    files=sorted(buckets[key], key=_cluster_sort_key),
                    if_exists=cfg.if_exists,
                    include=cfg.include,
                    exclude=cfg.exclude,
--- a/generic_loader/sample_folder_config.yaml
+++ b/generic_loader/sample_folder_config.yaml
@@ -19,6 +19,12 @@ if_exists: replace
 # auto-grouped with its peers by stripping trailing digits (and any trailing
 # _ / -) from the file stem. Files with no trailing digits become their own
 # singleton cluster.
 #
 # Auto-detect only recognizes *trailing* digit runs. If your file names put
 # the varying number in the middle of the stem (e.g. surrounded by year,
 # region, and detail components), auto-detect will NOT group them - each
 # file becomes its own singleton cluster. Use an explicit pattern instead;
 # see the embedded-digit example near the bottom of this file.
 auto_detect: true
 # Folder-level column filter. Every file in every cluster passes through
@@ -92,7 +98,20 @@ clusters:
  #     - region
  #     - year
-  # With only the gq pattern explicit, auto_detect: true will still bucket
+  # Embedded-digit example. When the varying number sits in the MIDDLE of
-  # group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
+  # the stem (e.g. year2020_regionA_40_detail.sas7bdat,
  # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
  # them - each file becomes its own singleton cluster. An explicit
  # pattern bucketizes them correctly. The \d+ matches any width, and
  # files within the cluster are sorted numerically by the last digit
  # group in the stem, so _9_ sorts before _40_ regardless of zero-
  # padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
  # fine - whatever files are present get loaded in numeric order.
  #
  # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
  #   tablename: year2020_regionA_detail
  # With only the group_a pattern explicit, auto_detect: true will still
  # bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
  # standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
  # for the fixture that exercises exactly this layout.