Add --no-prescan option to load_folder.py for skipping metadata scan

Introduced a new command-line argument, --no-prescan, that lets users bypass the per-file metadata scan performed before loading. This is particularly useful for large folders, where the pre-scan itself can be time-consuming, and when iterating quickly on a failure. Without the pre-scan the progress bar still displays rows loaded, rate, and elapsed time, but it cannot estimate the time remaining (no ETA). Updated the main function to handle this new option and adjusted the progress tracking accordingly.
This commit is contained in:
David Peterson 2026-04-21 08:12:39 -05:00
parent 052fb0e087
commit 2dd247b067

View File

@ -1071,6 +1071,17 @@ def _build_argparser() -> argparse.ArgumentParser:
"PGUSER / PGPASSWORD from the environment or .env file." "PGUSER / PGPASSWORD from the environment or .env file."
), ),
) )
p.add_argument(
"--no-prescan",
action="store_true",
help=(
"Skip the per-file metadata scan that populates the folder-wide "
"tqdm ETA. Useful when the folder is large (half-hour+ pre-scan) "
"or when you're iterating quickly on a failure. Without the "
"pre-scan the progress bar still shows rows loaded, rate, and "
"elapsed time - it just can't estimate remaining time."
),
)
p.add_argument( p.add_argument(
"--workers", "--workers",
type=int, type=int,
@ -1234,7 +1245,21 @@ def main(argv: Optional[List[str]] = None) -> int:
# of scattered subheader pages per file - sequentially that's minutes for # of scattered subheader pages per file - sequentially that's minutes for
# a 52-file folder. pyreadstat releases the GIL during I/O and C decoding, # a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
# so a ThreadPool gives near-linear scaling until the disk saturates. # so a ThreadPool gives near-linear scaling until the disk saturates.
# ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
# without an ETA - useful when pre-scan itself is expensive (half hour+
# on very large files) or when debugging iteratively.
all_files: List[Path] = [p for c in loadable for p in c.files] all_files: List[Path] = [p for c in loadable for p in c.files]
grand_total: Optional[int] = 0
if args.no_prescan:
grand_total = None
print(
f"[info] --no-prescan set: skipping row-count pre-scan for "
f"{len(all_files)} file(s); progress bar will show rate + "
f"elapsed but no ETA.",
file=sys.stderr,
)
else:
prescan_workers = min(16, max(1, len(all_files))) prescan_workers = min(16, max(1, len(all_files)))
print( print(
f"pre-scanning row counts for {len(all_files)} file(s) " f"pre-scanning row counts for {len(all_files)} file(s) "
@ -1250,8 +1275,8 @@ def main(argv: Optional[List[str]] = None) -> int:
except Exception as e: except Exception as e:
return (p, None, str(e)) return (p, None, str(e))
grand_total = 0
unknown_total_files: List[str] = [] unknown_total_files: List[str] = []
running_total = 0
with ThreadPoolExecutor(max_workers=prescan_workers) as tpool: with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
prescan_bar = tqdm( prescan_bar = tqdm(
total=len(all_files), total=len(all_files),
@ -1268,7 +1293,7 @@ def main(argv: Optional[List[str]] = None) -> int:
elif n is None: elif n is None:
unknown_total_files.append(p.name) unknown_total_files.append(p.name)
else: else:
grand_total += n running_total += n
finally: finally:
prescan_bar.close() prescan_bar.close()
@ -1280,9 +1305,10 @@ def main(argv: Optional[List[str]] = None) -> int:
file=sys.stderr, file=sys.stderr,
) )
print( print(
f" total rows across folder: {grand_total:,}", f" total rows across folder: {running_total:,}",
file=sys.stderr, file=sys.stderr,
) )
grand_total = running_total
# -- Shared progress plumbing --------------------------------------------- # -- Shared progress plumbing ---------------------------------------------
# The queue crosses process boundaries when workers > 1 (managed proxy) # The queue crosses process boundaries when workers > 1 (managed proxy)