advanced_analyzer #8

Merged
dp merged 23 commits from advanced_analyzer into main 2026-04-21 22:32:18 +00:00
Showing only changes of commit 052fb0e087 - Show all commits

View File

@ -138,7 +138,7 @@ import queue as _queue_mod
import re import re
import sys import sys
import threading import threading
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@ -1230,25 +1230,48 @@ def main(argv: Optional[List[str]] = None) -> int:
# -- Metadata pre-scan ----------------------------------------------------- # -- Metadata pre-scan -----------------------------------------------------
# Sum ``number_rows`` across every file so the tqdm bar has a real # Sum ``number_rows`` across every file so the tqdm bar has a real
# denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True`` # denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
# fast path; a few ms per sas7bdat even on large files. # fast path, but on multi-GB sas7bdat files that still reads tens of MB
# of scattered subheader pages per file - sequentially that's minutes for
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
# so a ThreadPool gives near-linear scaling until the disk saturates.
all_files: List[Path] = [p for c in loadable for p in c.files]
prescan_workers = min(16, max(1, len(all_files)))
print( print(
f"pre-scanning row counts for {sum(len(c.files) for c in loadable)} " f"pre-scanning row counts for {len(all_files)} file(s) "
f"file(s)...", f"across {prescan_workers} thread(s)...",
file=sys.stderr, file=sys.stderr,
) )
grand_total = 0
unknown_total_files: List[str] = [] def _scan_one(p: Path) -> Tuple[Path, Optional[int], Optional[str]]:
for c in loadable:
for p in c.files:
try: try:
meta = read_sas_metadata(p) meta = read_sas_metadata(p)
n = getattr(meta, "number_rows", None) n = getattr(meta, "number_rows", None)
if n is None: return (p, int(n) if n is not None else None, None)
except Exception as e:
return (p, None, str(e))
grand_total = 0
unknown_total_files: List[str] = []
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
prescan_bar = tqdm(
total=len(all_files),
unit="file",
desc=" prescanning",
file=sys.stderr,
dynamic_ncols=True,
)
try:
for p, n, err in tpool.map(_scan_one, all_files):
prescan_bar.update(1)
if err is not None:
unknown_total_files.append(f"{p.name} ({err})")
elif n is None:
unknown_total_files.append(p.name) unknown_total_files.append(p.name)
else: else:
grand_total += int(n) grand_total += n
except Exception as e: finally:
unknown_total_files.append(f"{p.name} ({e})") prescan_bar.close()
if unknown_total_files: if unknown_total_files:
print( print(
f"[warn] could not read row count from " f"[warn] could not read row count from "