_DIGIT_GROUP_RE = re.compile(r"\d+")


def _cluster_sort_key(path: Path) -> Tuple[int, tuple, str]:
    """Sort key for ordering files within a cluster.

    The primary key is the LAST digit group in the stem, compared as an
    integer, so ``_9`` comes before ``_10`` / ``_40`` regardless of width,
    and a file named ``foo_9_detail`` lands before ``foo_40_detail``. The
    first file under this order is the one whose schema is inferred and
    used to create the target table; sorting numerically keeps that choice
    stable as the file set grows. Files with no digits get ``-1`` so they
    sort before numbered files.

    Files that tie on the last digit group (for example ``survey_9_v2`` and
    ``survey_10_v2``, where the varying number is NOT the last group) are
    ordered by a natural-order view of the whole stem: every digit run is
    compared as an integer, every other run as text. A plain string
    comparison would put ``survey_10_v2`` ahead of ``survey_9_v2`` here.
    The raw stem is the final tiebreaker so stems that differ only in
    zero-padding (``foo_01`` vs ``foo_1``) still order reproducibly.

    Returns a tuple ``(last_number, natural_parts, stem)`` intended only
    for use as a ``sorted(..., key=...)`` key.
    """
    stem = path.stem
    # ``pieces`` strictly alternates text (even index) and int (odd index),
    # always starting with a (possibly empty) text run. Two keys therefore
    # never compare a str against an int at the same position.
    pieces = []
    last_number = -1
    cursor = 0
    for match in _DIGIT_GROUP_RE.finditer(stem):
        pieces.append(stem[cursor:match.start()])
        last_number = int(match.group())
        pieces.append(last_number)
        cursor = match.end()
    pieces.append(stem[cursor:])
    return (last_number, tuple(pieces), stem)
surrounded by year, +# region, and detail components), auto-detect will NOT group them - each +# file becomes its own singleton cluster. Use an explicit pattern instead; +# see the embedded-digit example near the bottom of this file. auto_detect: true # Folder-level column filter. Every file in every cluster passes through @@ -92,7 +98,20 @@ clusters: # - region # - year - # With only the gq pattern explicit, auto_detect: true will still bucket - # group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone + # Embedded-digit example. When the varying number sits in the MIDDLE of + # the stem (e.g. year2020_regionA_40_detail.sas7bdat, + # year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group + # them - each file becomes its own singleton cluster. An explicit + # pattern bucketizes them correctly. The \d+ matches any width, and + # files within the cluster are sorted numerically by the last digit + # group in the stem, so _9_ sorts before _40_ regardless of zero- + # padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are + # fine - whatever files are present get loaded in numeric order. + # + # - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$' + # tablename: year2020_regionA_detail + + # With only the group_a pattern explicit, auto_detect: true will still + # bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone # standalone.xpt into a "standalone" cluster. See generate_sample_folder.py # for the fixture that exercises exactly this layout.