advanced_analyzer #8
@ -1071,6 +1071,17 @@ def _build_argparser() -> argparse.ArgumentParser:
|
|||||||
"PGUSER / PGPASSWORD from the environment or .env file."
|
"PGUSER / PGPASSWORD from the environment or .env file."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--no-prescan",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Skip the per-file metadata scan that populates the folder-wide "
|
||||||
|
"tqdm ETA. Useful when the folder is large (half-hour+ pre-scan) "
|
||||||
|
"or when you're iterating quickly on a failure. Without the "
|
||||||
|
"pre-scan the progress bar still shows rows loaded, rate, and "
|
||||||
|
"elapsed time - it just can't estimate remaining time."
|
||||||
|
),
|
||||||
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
"--workers",
|
"--workers",
|
||||||
type=int,
|
type=int,
|
||||||
@ -1234,7 +1245,21 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
# of scattered subheader pages per file - sequentially that's minutes for
|
# of scattered subheader pages per file - sequentially that's minutes for
|
||||||
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
|
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
|
||||||
# so a ThreadPool gives near-linear scaling until the disk saturates.
|
# so a ThreadPool gives near-linear scaling until the disk saturates.
|
||||||
|
# ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
|
||||||
|
# without an ETA - useful when pre-scan itself is expensive (half hour+
|
||||||
|
# on very large files) or when debugging iteratively.
|
||||||
all_files: List[Path] = [p for c in loadable for p in c.files]
|
all_files: List[Path] = [p for c in loadable for p in c.files]
|
||||||
|
grand_total: Optional[int] = 0
|
||||||
|
|
||||||
|
if args.no_prescan:
|
||||||
|
grand_total = None
|
||||||
|
print(
|
||||||
|
f"[info] --no-prescan set: skipping row-count pre-scan for "
|
||||||
|
f"{len(all_files)} file(s); progress bar will show rate + "
|
||||||
|
f"elapsed but no ETA.",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
else:
|
||||||
prescan_workers = min(16, max(1, len(all_files)))
|
prescan_workers = min(16, max(1, len(all_files)))
|
||||||
print(
|
print(
|
||||||
f"pre-scanning row counts for {len(all_files)} file(s) "
|
f"pre-scanning row counts for {len(all_files)} file(s) "
|
||||||
@ -1250,8 +1275,8 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return (p, None, str(e))
|
return (p, None, str(e))
|
||||||
|
|
||||||
grand_total = 0
|
|
||||||
unknown_total_files: List[str] = []
|
unknown_total_files: List[str] = []
|
||||||
|
running_total = 0
|
||||||
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
|
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
|
||||||
prescan_bar = tqdm(
|
prescan_bar = tqdm(
|
||||||
total=len(all_files),
|
total=len(all_files),
|
||||||
@ -1268,7 +1293,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
elif n is None:
|
elif n is None:
|
||||||
unknown_total_files.append(p.name)
|
unknown_total_files.append(p.name)
|
||||||
else:
|
else:
|
||||||
grand_total += n
|
running_total += n
|
||||||
finally:
|
finally:
|
||||||
prescan_bar.close()
|
prescan_bar.close()
|
||||||
|
|
||||||
@ -1280,9 +1305,10 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
print(
|
print(
|
||||||
f" total rows across folder: {grand_total:,}",
|
f" total rows across folder: {running_total:,}",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
|
grand_total = running_total
|
||||||
|
|
||||||
# -- Shared progress plumbing ---------------------------------------------
|
# -- Shared progress plumbing ---------------------------------------------
|
||||||
# The queue crosses process boundaries when workers > 1 (managed proxy)
|
# The queue crosses process boundaries when workers > 1 (managed proxy)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user