Refactor pre-scan process in load_folder.py to utilize ThreadPoolExecutor for improved performance

Updated the main function to replace sequential file processing with a threaded approach using ThreadPoolExecutor. This change enhances the efficiency of reading row counts from SAS files, particularly for large datasets, by allowing concurrent I/O operations. Added progress tracking with tqdm for better user feedback during the pre-scan phase.
This commit is contained in:
David Peterson 2026-04-20 22:43:02 -05:00
parent fe7dc4d5a1
commit 052fb0e087

View File

@ -138,7 +138,7 @@ import queue as _queue_mod
import re import re
import sys import sys
import threading import threading
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@ -1230,25 +1230,48 @@ def main(argv: Optional[List[str]] = None) -> int:
# -- Metadata pre-scan ----------------------------------------------------- # -- Metadata pre-scan -----------------------------------------------------
# Sum ``number_rows`` across every file so the tqdm bar has a real # Sum ``number_rows`` across every file so the tqdm bar has a real
# denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True`` # denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
# fast path; a few ms per sas7bdat even on large files. # fast path, but on multi-GB sas7bdat files that still reads tens of MB
# of scattered subheader pages per file - sequentially that's minutes for
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
# so a ThreadPool gives near-linear scaling until the disk saturates.
all_files: List[Path] = [p for c in loadable for p in c.files]
prescan_workers = min(16, max(1, len(all_files)))
print( print(
f"pre-scanning row counts for {sum(len(c.files) for c in loadable)} " f"pre-scanning row counts for {len(all_files)} file(s) "
f"file(s)...", f"across {prescan_workers} thread(s)...",
file=sys.stderr, file=sys.stderr,
) )
grand_total = 0
unknown_total_files: List[str] = [] def _scan_one(p: Path) -> Tuple[Path, Optional[int], Optional[str]]:
for c in loadable:
for p in c.files:
try: try:
meta = read_sas_metadata(p) meta = read_sas_metadata(p)
n = getattr(meta, "number_rows", None) n = getattr(meta, "number_rows", None)
if n is None: return (p, int(n) if n is not None else None, None)
except Exception as e:
return (p, None, str(e))
grand_total = 0
unknown_total_files: List[str] = []
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
prescan_bar = tqdm(
total=len(all_files),
unit="file",
desc=" prescanning",
file=sys.stderr,
dynamic_ncols=True,
)
try:
for p, n, err in tpool.map(_scan_one, all_files):
prescan_bar.update(1)
if err is not None:
unknown_total_files.append(f"{p.name} ({err})")
elif n is None:
unknown_total_files.append(p.name) unknown_total_files.append(p.name)
else: else:
grand_total += int(n) grand_total += n
except Exception as e: finally:
unknown_total_files.append(f"{p.name} ({e})") prescan_bar.close()
if unknown_total_files: if unknown_total_files:
print( print(
f"[warn] could not read row count from " f"[warn] could not read row count from "