Enhance file clustering by implementing numeric sorting for last digit groups in stems and updating documentation for embedded-digit handling in auto-detection.
This commit is contained in:
parent
b3d7a9d440
commit
b78f6d648f
@ -90,6 +90,25 @@ Exit codes:
|
|||||||
* Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with
|
* Auto-detect groups remaining files by ``re.sub(r'\\d+$', '', stem)`` with
|
||||||
any trailing ``_`` / ``-`` stripped afterward. Stems without trailing
|
any trailing ``_`` / ``-`` stripped afterward. Stems without trailing
|
||||||
digits become singleton clusters named after the stem.
|
digits become singleton clusters named after the stem.
|
||||||
|
* Within a cluster, files are sorted **numerically** by the last digit
|
||||||
|
group in the stem, so ``..._9_...`` comes before ``..._10_...`` /
|
||||||
|
``..._40_...`` regardless of zero-padding. The first file in that
|
||||||
|
order drives schema inference; the rest are checked against that
|
||||||
|
schema via :func:`load_sas.assert_schema_compatible`. Gaps in the
|
||||||
|
numeric sequence (missing ``3``, ``7``, ``14``, ...) are irrelevant -
|
||||||
|
whatever files are present get loaded in numeric order.
|
||||||
|
* Auto-detect only recognizes *trailing* digit runs. File names where
|
||||||
|
the varying number sits in the middle of the stem (surrounded by
|
||||||
|
other name components) are not grouped by auto-detect - each becomes
|
||||||
|
its own singleton cluster. Use an explicit pattern to bucket them::
|
||||||
|
|
||||||
|
clusters:
|
||||||
|
- pattern: '^year2020_regionA_\\d+_detail\\.sas7bdat$'
|
||||||
|
tablename: year2020_regionA_detail
|
||||||
|
|
||||||
|
The regex still matches any digit width, so numbers like ``9`` and
|
||||||
|
``40`` both land in the same cluster and the numeric sort above puts
|
||||||
|
``9`` before ``40``.
|
||||||
|
|
||||||
4. Library usage
|
4. Library usage
|
||||||
----------------
|
----------------
|
||||||
@ -487,6 +506,7 @@ def load_folder_config(path: Path) -> FolderConfig:
|
|||||||
|
|
||||||
|
|
||||||
_TRAILING_DIGIT_RE = re.compile(r"\d+$")
|
_TRAILING_DIGIT_RE = re.compile(r"\d+$")
|
||||||
|
# Matches each run of consecutive digits anywhere in a string (not only a
# trailing run); _cluster_sort_key takes the last such run from a file stem.
_DIGIT_GROUP_RE = re.compile(r"\d+")
|
||||||
|
|
||||||
|
|
||||||
def _auto_prefix(stem: str) -> str:
|
def _auto_prefix(stem: str) -> str:
|
||||||
@ -501,6 +521,22 @@ def _auto_prefix(stem: str) -> str:
|
|||||||
return stripped or stem
|
return stripped or stem
|
||||||
|
|
||||||
|
|
||||||
|
def _cluster_sort_key(path: Path) -> Tuple[int, str]:
    """Build the ordering key for a file inside a cluster.

    The key is ``(number, stem)``, where ``number`` is the integer value of
    the final run of digits found anywhere in ``path.stem``. Comparing the
    integer rather than the text means ``_9`` orders ahead of ``_10`` and
    ``_40`` whatever the zero-padding, and ``foo_9_detail`` orders ahead of
    ``foo_40_detail``. The first file under this ordering is the one whose
    schema is inferred and used to create the target table, so a numeric
    order keeps that choice stable as more files are added.

    A stem containing no digits gets ``-1`` and therefore sorts ahead of
    every numbered file. The stem itself is the second key element so that
    files sharing a number still come out in a reproducible order.
    """
    stem = path.stem
    digit_runs = _DIGIT_GROUP_RE.findall(stem)
    if not digit_runs:
        # No number to order by: place the file before all numbered ones.
        return (-1, stem)
    # Only the last digit run counts; earlier runs (e.g. a year) are ignored.
    return (int(digit_runs[-1]), stem)
|
||||||
|
|
||||||
|
|
||||||
def _list_sas_files(folder: Path) -> List[Path]:
|
def _list_sas_files(folder: Path) -> List[Path]:
|
||||||
files: List[Path] = []
|
files: List[Path] = []
|
||||||
for p in sorted(folder.iterdir()):
|
for p in sorted(folder.iterdir()):
|
||||||
@ -582,7 +618,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
clusters.append(
|
clusters.append(
|
||||||
ClusterSpec(
|
ClusterSpec(
|
||||||
tablename=patt.tablename,
|
tablename=patt.tablename,
|
||||||
files=sorted(matched),
|
files=sorted(matched, key=_cluster_sort_key),
|
||||||
if_exists=patt.if_exists or cfg.if_exists,
|
if_exists=patt.if_exists or cfg.if_exists,
|
||||||
include=patt.include if patt.include is not None else cfg.include,
|
include=patt.include if patt.include is not None else cfg.include,
|
||||||
exclude=patt.exclude if patt.exclude is not None else cfg.exclude,
|
exclude=patt.exclude if patt.exclude is not None else cfg.exclude,
|
||||||
@ -603,7 +639,7 @@ def discover_clusters(cfg: FolderConfig) -> List[ClusterSpec]:
|
|||||||
clusters.append(
|
clusters.append(
|
||||||
ClusterSpec(
|
ClusterSpec(
|
||||||
tablename=key,
|
tablename=key,
|
||||||
files=sorted(buckets[key]),
|
files=sorted(buckets[key], key=_cluster_sort_key),
|
||||||
if_exists=cfg.if_exists,
|
if_exists=cfg.if_exists,
|
||||||
include=cfg.include,
|
include=cfg.include,
|
||||||
exclude=cfg.exclude,
|
exclude=cfg.exclude,
|
||||||
|
|||||||
@ -19,6 +19,12 @@ if_exists: replace
|
|||||||
# auto-grouped with its peers by stripping trailing digits (and any trailing
|
# auto-grouped with its peers by stripping trailing digits (and any trailing
|
||||||
# _ / -) from the file stem. Files with no trailing digits become their own
|
# _ / -) from the file stem. Files with no trailing digits become their own
|
||||||
# singleton cluster.
|
# singleton cluster.
|
||||||
|
#
|
||||||
|
# Auto-detect only recognizes *trailing* digit runs. If your file names put
|
||||||
|
# the varying number in the middle of the stem (e.g. surrounded by year,
|
||||||
|
# region, and detail components), auto-detect will NOT group them - each
|
||||||
|
# file becomes its own singleton cluster. Use an explicit pattern instead;
|
||||||
|
# see the embedded-digit example near the bottom of this file.
|
||||||
auto_detect: true
|
auto_detect: true
|
||||||
|
|
||||||
# Folder-level column filter. Every file in every cluster passes through
|
# Folder-level column filter. Every file in every cluster passes through
|
||||||
@ -92,7 +98,20 @@ clusters:
|
|||||||
# - region
|
# - region
|
||||||
# - year
|
# - year
|
||||||
|
|
||||||
# With only the gq pattern explicit, auto_detect: true will still bucket
|
# Embedded-digit example. When the varying number sits in the MIDDLE of
|
||||||
# group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
|
# the stem (e.g. year2020_regionA_40_detail.sas7bdat,
|
||||||
|
# year2020_regionA_41_detail.sas7bdat, ...), auto-detect will NOT group
|
||||||
|
# them - each file becomes its own singleton cluster. An explicit
|
||||||
|
# pattern buckets them correctly. The \d+ matches any width, and
|
||||||
|
# files within the cluster are sorted numerically by the last digit
|
||||||
|
# group in the stem, so _9_ sorts before _40_ regardless of zero-
|
||||||
|
# padding. Gaps in the numeric sequence (missing 3, 7, 14, ...) are
|
||||||
|
# fine - whatever files are present get loaded in numeric order.
|
||||||
|
#
|
||||||
|
# - pattern: '^year2020_regionA_\d+_detail\.sas7bdat$'
|
||||||
|
# tablename: year2020_regionA_detail
|
||||||
|
|
||||||
|
# With only the group_a pattern explicit, auto_detect: true will still
|
||||||
|
# bucket group_b1.xpt + group_b2.xpt into a "group_b" cluster and the lone
|
||||||
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
|
# standalone.xpt into a "standalone" cluster. See generate_sample_folder.py
|
||||||
# for the fixture that exercises exactly this layout.
|
# for the fixture that exercises exactly this layout.
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user