Add --chunk-rows option to load_folder.py for customizable memory management

Introduced a new command-line argument, --chunk-rows, that lets users specify the number of rows per chunk for pyreadstat streaming and COPY operations. When provided, it overrides both the GENERIC_LOADER_CHUNK_ROWS environment variable and the auto-scaling applied when running with multiple workers. The help text now documents how peak memory per worker scales with the chosen chunk size. In parallel mode the auto-selected chunk size now shrinks as the worker count grows (clamped to 50,000–500,000 rows) instead of being fixed at 500,000. copy_dataframes now releases each chunk's intermediate objects as soon as they have been consumed, lowering peak memory during large dataset processing.
This commit is contained in:
David Peterson 2026-04-21 10:05:21 -05:00
parent eac75cbb26
commit 9afb52aecb
2 changed files with 68 additions and 14 deletions

View File

@ -1083,6 +1083,20 @@ def _build_argparser() -> argparse.ArgumentParser:
"PGUSER / PGPASSWORD from the environment or .env file."
),
)
p.add_argument(
"--chunk-rows",
type=int,
default=None,
metavar="N",
help=(
"Per-chunk row target for pyreadstat streaming and COPY. "
"Overrides both the GENERIC_LOADER_CHUNK_ROWS env var and the "
"auto-scaling applied when --workers > 1. Peak memory per "
"worker is roughly 4 × N × avg_row_bytes; with wide sas7bdat "
"files (~4 KB/row) and 32 workers, N=100000 is a safe starting "
"point on a 128 GB box."
),
)
p.add_argument(
"--no-prescan",
action="store_true",
@ -1234,19 +1248,40 @@ def main(argv: Optional[List[str]] = None) -> int:
workers = max(1, int(args.workers))
# When running parallel workers, bound peak memory: each worker buffers a
# chunk (read + prepared + serialized) so total memory scales with
# workers × chunk_rows × avg_row_bytes. Drop the default chunk target to
# 500k unless the operator has explicitly pinned it. Setting the env var
# before workers spawn means they inherit it through forkserver / spawn.
if (
workers > 1
and "GENERIC_LOADER_CHUNK_ROWS" not in os.environ
):
os.environ["GENERIC_LOADER_CHUNK_ROWS"] = "500000"
# Per-worker peak memory ~= chunk_rows × avg_row_bytes × ~4 (the original
# pyreadstat DataFrame, the type-coerced ``prepared`` copy, the pyarrow
# table, and the serialized CSV buffer can all be alive simultaneously).
# With 32 workers and 500k rows × wide sas7bdat that's easily >128 GB -
# the default the loader shipped with OOM'd on a c6i.32xlarge box. Scale
# the auto target inversely with worker count so total memory stays
# roughly flat regardless of how many workers you pick. Floor of 50k
# keeps per-chunk overhead amortized; ceiling of 500k is where pyarrow
# / pyreadstat buffer spikes start to dominate.
#
# Order of precedence (highest priority first):
# 1. ``--chunk-rows N`` CLI flag (if provided)
# 2. ``GENERIC_LOADER_CHUNK_ROWS`` env var (if already set)
# 3. Auto-pick based on ``workers``
if args.chunk_rows is not None:
os.environ["GENERIC_LOADER_CHUNK_ROWS"] = str(int(args.chunk_rows))
print(
"[info] parallel mode: bounding per-chunk rows to 500,000. "
"Pin GENERIC_LOADER_CHUNK_ROWS to override.",
f"[info] --chunk-rows {args.chunk_rows:,}: pinning per-chunk "
f"row target (overrides auto-scaling).",
file=sys.stderr,
)
elif "GENERIC_LOADER_CHUNK_ROWS" in os.environ:
print(
f"[info] honoring GENERIC_LOADER_CHUNK_ROWS="
f"{os.environ['GENERIC_LOADER_CHUNK_ROWS']} from environment.",
file=sys.stderr,
)
elif workers > 1:
auto_rows = max(50_000, min(500_000, 3_200_000 // workers))
os.environ["GENERIC_LOADER_CHUNK_ROWS"] = str(auto_rows)
print(
f"[info] parallel mode (workers={workers}): auto-scaled "
f"per-chunk rows to {auto_rows:,}. "
f"Use --chunk-rows N to override if you have RAM headroom.",
file=sys.stderr,
)

View File

@ -1881,15 +1881,34 @@ def copy_dataframes(
)
total = 0
# Pull chunks one at a time so each ``df`` is unreferenced before the
# generator reads the next one. Without this the loop-variable binding
# of a ``for df in dfs:`` keeps the previous chunk alive during the
# next pyreadstat read, pushing peak memory to 5-6× chunk size per
# worker (old df + incoming df + prepared + pyarrow table + CSV buf).
# With explicit drops we cap peak at ~2× chunk size: ``df`` goes away
# once ``prepared`` exists, ``prepared`` once ``buf`` exists, ``buf``
# once COPY has consumed it. Matters most in parallel mode where
# 32 × per-worker peak can exhaust a 128 GB host.
dfs_iter = iter(dfs)
with conn.cursor() as cur:
for df in dfs:
while True:
try:
df = next(dfs_iter)
except StopIteration:
break
if df.empty:
del df
continue
prepared = _prepare_for_copy(df, columns)
del df
n = len(prepared)
buf = _serialize_chunk_csv(prepared)
del prepared
cur.copy_expert(sql, buf)
del buf
conn.commit()
total += len(prepared)
total += n
return total