advanced_analyzer #8
@ -138,7 +138,7 @@ import queue as _queue_mod
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
@ -1230,25 +1230,48 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
# -- Metadata pre-scan -----------------------------------------------------
|
# -- Metadata pre-scan -----------------------------------------------------
|
||||||
# Sum ``number_rows`` across every file so the tqdm bar has a real
|
# Sum ``number_rows`` across every file so the tqdm bar has a real
|
||||||
# denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
|
# denominator. ``read_sas_metadata`` uses pyreadstat's ``metadataonly=True``
|
||||||
# fast path; a few ms per sas7bdat even on large files.
|
# fast path, but on multi-GB sas7bdat files that still reads tens of MB
|
||||||
|
# of scattered subheader pages per file - sequentially that's minutes for
|
||||||
|
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
|
||||||
|
# so a ThreadPool gives near-linear scaling until the disk saturates.
|
||||||
|
all_files: List[Path] = [p for c in loadable for p in c.files]
|
||||||
|
prescan_workers = min(16, max(1, len(all_files)))
|
||||||
print(
|
print(
|
||||||
f"pre-scanning row counts for {sum(len(c.files) for c in loadable)} "
|
f"pre-scanning row counts for {len(all_files)} file(s) "
|
||||||
f"file(s)...",
|
f"across {prescan_workers} thread(s)...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _scan_one(p: Path) -> Tuple[Path, Optional[int], Optional[str]]:
|
||||||
|
try:
|
||||||
|
meta = read_sas_metadata(p)
|
||||||
|
n = getattr(meta, "number_rows", None)
|
||||||
|
return (p, int(n) if n is not None else None, None)
|
||||||
|
except Exception as e:
|
||||||
|
return (p, None, str(e))
|
||||||
|
|
||||||
grand_total = 0
|
grand_total = 0
|
||||||
unknown_total_files: List[str] = []
|
unknown_total_files: List[str] = []
|
||||||
for c in loadable:
|
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
|
||||||
for p in c.files:
|
prescan_bar = tqdm(
|
||||||
try:
|
total=len(all_files),
|
||||||
meta = read_sas_metadata(p)
|
unit="file",
|
||||||
n = getattr(meta, "number_rows", None)
|
desc=" prescanning",
|
||||||
if n is None:
|
file=sys.stderr,
|
||||||
|
dynamic_ncols=True,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
for p, n, err in tpool.map(_scan_one, all_files):
|
||||||
|
prescan_bar.update(1)
|
||||||
|
if err is not None:
|
||||||
|
unknown_total_files.append(f"{p.name} ({err})")
|
||||||
|
elif n is None:
|
||||||
unknown_total_files.append(p.name)
|
unknown_total_files.append(p.name)
|
||||||
else:
|
else:
|
||||||
grand_total += int(n)
|
grand_total += n
|
||||||
except Exception as e:
|
finally:
|
||||||
unknown_total_files.append(f"{p.name} ({e})")
|
prescan_bar.close()
|
||||||
|
|
||||||
if unknown_total_files:
|
if unknown_total_files:
|
||||||
print(
|
print(
|
||||||
f"[warn] could not read row count from "
|
f"[warn] could not read row count from "
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user