advanced_analyzer #8

Merged
dp merged 23 commits from advanced_analyzer into main 2026-04-21 22:32:18 +00:00
Showing only changes of commit 2dd247b067 - Show all commits

View File

@ -1071,6 +1071,17 @@ def _build_argparser() -> argparse.ArgumentParser:
"PGUSER / PGPASSWORD from the environment or .env file." "PGUSER / PGPASSWORD from the environment or .env file."
), ),
) )
p.add_argument(
"--no-prescan",
action="store_true",
help=(
"Skip the per-file metadata scan that populates the folder-wide "
"tqdm ETA. Useful when the folder is large (half-hour+ pre-scan) "
"or when you're iterating quickly on a failure. Without the "
"pre-scan the progress bar still shows rows loaded, rate, and "
"elapsed time - it just can't estimate remaining time."
),
)
p.add_argument( p.add_argument(
"--workers", "--workers",
type=int, type=int,
@ -1234,7 +1245,21 @@ def main(argv: Optional[List[str]] = None) -> int:
# of scattered subheader pages per file - sequentially that's minutes for # of scattered subheader pages per file - sequentially that's minutes for
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding, # a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
# so a ThreadPool gives near-linear scaling until the disk saturates. # so a ThreadPool gives near-linear scaling until the disk saturates.
# ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
# without an ETA - useful when pre-scan itself is expensive (half hour+
# on very large files) or when debugging iteratively.
all_files: List[Path] = [p for c in loadable for p in c.files] all_files: List[Path] = [p for c in loadable for p in c.files]
grand_total: Optional[int] = 0
if args.no_prescan:
grand_total = None
print(
f"[info] --no-prescan set: skipping row-count pre-scan for "
f"{len(all_files)} file(s); progress bar will show rate + "
f"elapsed but no ETA.",
file=sys.stderr,
)
else:
prescan_workers = min(16, max(1, len(all_files))) prescan_workers = min(16, max(1, len(all_files)))
print( print(
f"pre-scanning row counts for {len(all_files)} file(s) " f"pre-scanning row counts for {len(all_files)} file(s) "
@ -1250,8 +1275,8 @@ def main(argv: Optional[List[str]] = None) -> int:
except Exception as e: except Exception as e:
return (p, None, str(e)) return (p, None, str(e))
grand_total = 0
unknown_total_files: List[str] = [] unknown_total_files: List[str] = []
running_total = 0
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool: with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
prescan_bar = tqdm( prescan_bar = tqdm(
total=len(all_files), total=len(all_files),
@ -1268,7 +1293,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif n is None: elif n is None:
unknown_total_files.append(p.name) unknown_total_files.append(p.name)
else: else:
grand_total += n running_total += n
finally: finally:
prescan_bar.close() prescan_bar.close()
@ -1280,9 +1305,10 @@ def main(argv: Optional[List[str]] = None) -> int:
file=sys.stderr, file=sys.stderr,
) )
print( print(
f" total rows across folder: {grand_total:,}", f" total rows across folder: {running_total:,}",
file=sys.stderr, file=sys.stderr,
) )
grand_total = running_total
# -- Shared progress plumbing --------------------------------------------- # -- Shared progress plumbing ---------------------------------------------
# The queue crosses process boundaries when workers > 1 (managed proxy) # The queue crosses process boundaries when workers > 1 (managed proxy)