Add --no-prescan option to load_folder.py for skipping metadata scan

Introduce a new command-line argument, --no-prescan, that lets users bypass the per-file metadata scan (the row-count pre-scan) that runs before loading. This is particularly useful for large folders, where the pre-scan itself can be time-consuming, or when iterating quickly on a failure. The progress bar still displays rows loaded, rate, and elapsed time, but it cannot show an estimated time remaining (ETA) because the total row count is unknown. Update the main function to handle the new option and adjust the progress tracking accordingly.
2026-04-21 08:12:39 -05:00 · 2026-04-21 08:12:39 -05:00 · 2dd247b067
commit 2dd247b067
parent 052fb0e087
1 changed file with 70 additions and 44 deletions
--- a/generic_loader/load_folder.py
+++ b/generic_loader/load_folder.py
@ -1071,6 +1071,17 @@ def _build_argparser() -> argparse.ArgumentParser:
            "PGUSER / PGPASSWORD from the environment or .env file."
        ),
    )
    p.add_argument(
        "--no-prescan",
        action="store_true",
        help=(
            "Skip the per-file metadata scan that populates the folder-wide "
            "tqdm ETA. Useful when the folder is large (half-hour+ pre-scan) "
            "or when you're iterating quickly on a failure. Without the "
            "pre-scan the progress bar still shows rows loaded, rate, and "
            "elapsed time - it just can't estimate remaining time."
        ),
    )
    p.add_argument(
        "--workers",
        type=int,
@ -1234,7 +1245,21 @@ def main(argv: Optional[List[str]] = None) -> int:
    # of scattered subheader pages per file - sequentially that's minutes for
    # a 52-file folder. pyreadstat releases the GIL during I/O and C decoding,
    # so a ThreadPool gives near-linear scaling until the disk saturates.
    # ``--no-prescan`` bypasses the scan entirely; the progress bar then runs
    # without an ETA - useful when pre-scan itself is expensive (half hour+
    # on very large files) or when debugging iteratively.
    all_files: List[Path] = [p for c in loadable for p in c.files]
    grand_total: Optional[int] = 0
    if args.no_prescan:
        grand_total = None
        print(
            f"[info] --no-prescan set: skipping row-count pre-scan for "
            f"{len(all_files)} file(s); progress bar will show rate + "
            f"elapsed but no ETA.",
            file=sys.stderr,
        )
    else:
        prescan_workers = min(16, max(1, len(all_files)))
        print(
            f"pre-scanning row counts for {len(all_files)} file(s) "
@ -1250,8 +1275,8 @@ def main(argv: Optional[List[str]] = None) -> int:
            except Exception as e:
                return (p, None, str(e))
    grand_total = 0
        unknown_total_files: List[str] = []
        running_total = 0
        with ThreadPoolExecutor(max_workers=prescan_workers) as tpool:
            prescan_bar = tqdm(
                total=len(all_files),
@ -1268,7 +1293,7 @@ def main(argv: Optional[List[str]] = None) -> int:
                    elif n is None:
                        unknown_total_files.append(p.name)
                    else:
-                    grand_total += n
+                        running_total += n
            finally:
                prescan_bar.close()
@ -1280,9 +1305,10 @@ def main(argv: Optional[List[str]] = None) -> int:
                file=sys.stderr,
            )
        print(
-        f"  total rows across folder: {grand_total:,}",
+            f"  total rows across folder: {running_total:,}",
            file=sys.stderr,
        )
        grand_total = running_total
    # -- Shared progress plumbing ---------------------------------------------
    # The queue crosses process boundaries when workers > 1 (managed proxy)