Enhance memory management in load_folder.py and load_sas.py for improved performance
Added memory management optimizations in the _worker_load_append_file function to release unused memory from pyarrow's pool and trigger Python's garbage collection. Implemented explicit memory trimming using glibc's malloc_trim to ensure efficient memory usage during long-running processes. Updated the copy_dataframes function in load_sas.py to release pyarrow's memory pool between chunks, preventing high memory usage in long-lived workers. These changes aim to reduce memory footprint and improve overall performance during large dataset processing.
This commit is contained in:
parent
9afb52aecb
commit
0c5e6e31f0
@ -944,6 +944,10 @@ def _worker_load_append_file(
|
|||||||
|
|
||||||
from dotenv import load_dotenv as _load_dotenv
|
from dotenv import load_dotenv as _load_dotenv
|
||||||
|
|
||||||
|
import ctypes
|
||||||
|
import ctypes.util
|
||||||
|
import gc
|
||||||
|
|
||||||
from load_sas import (
|
from load_sas import (
|
||||||
apply_column_filter as _apply_column_filter,
|
apply_column_filter as _apply_column_filter,
|
||||||
assert_schema_compatible as _assert_schema_compatible,
|
assert_schema_compatible as _assert_schema_compatible,
|
||||||
@ -962,6 +966,9 @@ def _worker_load_append_file(
|
|||||||
preview_df = _apply_column_filter(preview_df, include, exclude)
|
preview_df = _apply_column_filter(preview_df, include, exclude)
|
||||||
total_rows = getattr(meta, "number_rows", None)
|
total_rows = getattr(meta, "number_rows", None)
|
||||||
columns = _infer_schema(preview_df, meta, total_rows=total_rows)
|
columns = _infer_schema(preview_df, meta, total_rows=total_rows)
|
||||||
|
# Drop the preview ASAP - on a 2M-row wide file it's hundreds of MB
|
||||||
|
# and we never need it again after schema inference.
|
||||||
|
del preview_df, meta
|
||||||
|
|
||||||
user = db_overrides.get("user") if db_overrides else None
|
user = db_overrides.get("user") if db_overrides else None
|
||||||
password = db_overrides.get("password") if db_overrides else None
|
password = db_overrides.get("password") if db_overrides else None
|
||||||
@ -986,6 +993,32 @@ def _worker_load_append_file(
|
|||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return (path_str, 0, f"{type(e).__name__}: {e}")
|
return (path_str, 0, f"{type(e).__name__}: {e}")
|
||||||
|
finally:
|
||||||
|
# Hand memory back to the OS before the worker is recycled (or before
|
||||||
|
# ``max_tasks_per_child`` rotates this process). Three layers, each
|
||||||
|
# of which independently retains memory across calls:
|
||||||
|
#
|
||||||
|
# 1. pyarrow's memory pool aggressively reuses buffers - explicitly
|
||||||
|
# release_unused() returns them to the allocator.
|
||||||
|
# 2. Python's GC: cyclic refs from pandas/pyarrow chains aren't
|
||||||
|
# collected until a generation tick; force one now.
|
||||||
|
# 3. glibc's ptmalloc keeps freed heap in per-thread arenas instead
|
||||||
|
# of munmap'ing it back. ``malloc_trim(0)`` is the explicit ask.
|
||||||
|
# No-op (silently) on platforms without the symbol (macOS, etc).
|
||||||
|
try:
|
||||||
|
import pyarrow as _pa
|
||||||
|
_pa.default_memory_pool().release_unused()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
gc.collect()
|
||||||
|
try:
|
||||||
|
_libc_name = ctypes.util.find_library("c")
|
||||||
|
if _libc_name:
|
||||||
|
_libc = ctypes.CDLL(_libc_name)
|
||||||
|
if hasattr(_libc, "malloc_trim"):
|
||||||
|
_libc.malloc_trim(0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _load_remaining_files_parallel(
|
def _load_remaining_files_parallel(
|
||||||
@ -1013,7 +1046,19 @@ def _load_remaining_files_parallel(
|
|||||||
total = 0
|
total = 0
|
||||||
errors: List[Tuple[str, str]] = []
|
errors: List[Tuple[str, str]] = []
|
||||||
|
|
||||||
with ProcessPoolExecutor(max_workers=workers) as pool:
|
# ``max_tasks_per_child=1`` recycles each worker process after every
|
||||||
|
# file. Without this, glibc/pyarrow/pyreadstat all retain peak-water
|
||||||
|
# memory inside long-lived workers; over a multi-hour run the sum
|
||||||
|
# across workers monotonically grows even though individual chunks
|
||||||
|
# have been freed at the Python level. Recycling per file gives the
|
||||||
|
# OS the memory back unconditionally - the only cost is one fork +
|
||||||
|
# python interpreter startup per file (~1-2 s), which is noise next
|
||||||
|
# to multi-GB sas7bdat reads.
|
||||||
|
pool_kwargs: Dict[str, Any] = {"max_workers": workers}
|
||||||
|
if sys.version_info >= (3, 11):
|
||||||
|
pool_kwargs["max_tasks_per_child"] = 1
|
||||||
|
|
||||||
|
with ProcessPoolExecutor(**pool_kwargs) as pool:
|
||||||
futures = [
|
futures = [
|
||||||
pool.submit(
|
pool.submit(
|
||||||
_worker_load_append_file,
|
_worker_load_append_file,
|
||||||
|
|||||||
@ -1909,6 +1909,15 @@ def copy_dataframes(
|
|||||||
del buf
|
del buf
|
||||||
conn.commit()
|
conn.commit()
|
||||||
total += n
|
total += n
|
||||||
|
# Hand pyarrow's pool memory back between chunks. Without this,
|
||||||
|
# arrow's internal buffer pool keeps the high-water bytes
|
||||||
|
# reserved across the worker's lifetime - inside long-running
|
||||||
|
# workers this presents as steadily climbing RSS even with the
|
||||||
|
# ``del``s above. Cheap (microseconds); call it every chunk.
|
||||||
|
try:
|
||||||
|
pa.default_memory_pool().release_unused()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
return total
|
return total
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user