Add --chunk-rows option to load_folder.py for customizable memory management

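Introduced a new command-line argument, --chunk-rows N, allowing users to specify the per-chunk row target for pyreadstat streaming and COPY operations. When provided, this option overrides both the GENERIC_LOADER_CHUNK_ROWS environment variable and the auto-scaling behavior applied when using multiple workers; the order of precedence is --chunk-rows, then the environment variable, then the automatically chosen value. The automatic value now scales inversely with the worker count (3,200,000 // workers, clamped to the range 50,000-500,000) instead of being fixed at 500,000. The option's help text documents the estimated peak memory per worker (roughly 4 × N × avg_row_bytes) so users can pick a chunk size that fits their hardware. In load_sas.py, copy_dataframes now releases each chunk's DataFrame, prepared copy, and serialized CSV buffer as soon as it is no longer needed, lowering peak memory and improving usability during large dataset processing.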
2026-04-21 10:05:21 -05:00 · 2026-04-21 10:05:21 -05:00 · 9afb52aecb
commit 9afb52aecb
parent eac75cbb26
2 changed files with 68 additions and 14 deletions
--- a/generic_loader/load_folder.py
+++ b/generic_loader/load_folder.py
@@ -1083,6 +1083,20 @@ def _build_argparser() -> argparse.ArgumentParser:
            "PGUSER / PGPASSWORD from the environment or .env file."
        ),
    )
    p.add_argument(
        "--chunk-rows",
        type=int,
        default=None,
        metavar="N",
        help=(
            "Per-chunk row target for pyreadstat streaming and COPY. "
            "Overrides both the GENERIC_LOADER_CHUNK_ROWS env var and the "
            "auto-scaling applied when --workers > 1. Peak memory per "
            "worker is roughly 4 × N × avg_row_bytes; with wide sas7bdat "
            "files (~4 KB/row) and 32 workers, N=100000 is a safe starting "
            "point on a 128 GB box."
        ),
    )
    p.add_argument(
        "--no-prescan",
        action="store_true",
@@ -1234,19 +1248,40 @@ def main(argv: Optional[List[str]] = None) -> int:
    workers = max(1, int(args.workers))
-    # When running parallel workers, bound peak memory: each worker buffers a
+    # Per-worker peak memory ~= chunk_rows × avg_row_bytes × ~4 (the original
-    # chunk (read + prepared + serialized) so total memory scales with
+    # pyreadstat DataFrame, the type-coerced ``prepared`` copy, the pyarrow
-    # workers × chunk_rows × avg_row_bytes. Drop the default chunk target to
+    # table, and the serialized CSV buffer can all be alive simultaneously).
-    # 500k unless the operator has explicitly pinned it. Setting the env var
+    # With 32 workers and 500k rows × wide sas7bdat that's easily >128 GB -
-    # before workers spawn means they inherit it through forkserver / spawn.
+    # the default the loader shipped with OOM'd on a c6i.32xlarge box. Scale
-    if (
+    # the auto target inversely with worker count so total memory stays
-        workers > 1
+    # roughly flat regardless of how many workers you pick. Floor of 50k
-        and "GENERIC_LOADER_CHUNK_ROWS" not in os.environ
+    # keeps per-chunk overhead amortized; ceiling of 500k is where pyarrow
-    ):
+    # / pyreadstat buffer spikes start to dominate.
-        os.environ["GENERIC_LOADER_CHUNK_ROWS"] = "500000"
+    #
    # Order of precedence (most wins):
    #   1. ``--chunk-rows N`` CLI flag (if provided)
    #   2. ``GENERIC_LOADER_CHUNK_ROWS`` env var (if already set)
    #   3. Auto-pick based on ``workers``
    if args.chunk_rows is not None:
        os.environ["GENERIC_LOADER_CHUNK_ROWS"] = str(int(args.chunk_rows))
        print(
-            "[info] parallel mode: bounding per-chunk rows to 500,000. "
+            f"[info] --chunk-rows {args.chunk_rows:,}: pinning per-chunk "
-            "Pin GENERIC_LOADER_CHUNK_ROWS to override.",
+            f"row target (overrides auto-scaling).",
            file=sys.stderr,
        )
    elif "GENERIC_LOADER_CHUNK_ROWS" in os.environ:
        print(
            f"[info] honoring GENERIC_LOADER_CHUNK_ROWS="
            f"{os.environ['GENERIC_LOADER_CHUNK_ROWS']} from environment.",
            file=sys.stderr,
        )
    elif workers > 1:
        auto_rows = max(50_000, min(500_000, 3_200_000 // workers))
        os.environ["GENERIC_LOADER_CHUNK_ROWS"] = str(auto_rows)
        print(
            f"[info] parallel mode (workers={workers}): auto-scaled "
            f"per-chunk rows to {auto_rows:,}. "
            f"Use --chunk-rows N to override if you have RAM headroom.",
            file=sys.stderr,
        )
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -1881,15 +1881,34 @@ def copy_dataframes(
    )
    total = 0
    # Pull chunks one at a time so each ``df`` is unreferenced before the
    # generator reads the next one. Without this the loop-variable binding
    # of a ``for df in dfs:`` keeps the previous chunk alive during the
    # next pyreadstat read, pushing peak memory to 5-6× chunk size per
    # worker (old df + incoming df + prepared + pyarrow table + CSV buf).
    # With explicit drops we cap peak at ~2× chunk size: ``df`` goes away
    # once ``prepared`` exists, ``prepared`` once ``buf`` exists, ``buf``
    # once COPY has consumed it. Matters most in parallel mode where
    # 32 × per-worker peak can exhaust a 128 GB host.
    dfs_iter = iter(dfs)
    with conn.cursor() as cur:
-        for df in dfs:
+        while True:
            try:
                df = next(dfs_iter)
            except StopIteration:
                break
            if df.empty:
                del df
                continue
            prepared = _prepare_for_copy(df, columns)
            del df
            n = len(prepared)
            buf = _serialize_chunk_csv(prepared)
            del prepared
            cur.copy_expert(sql, buf)
            del buf
            conn.commit()
-            total += len(prepared)
+            total += n
    return total