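Suppress pandas PerformanceWarning in load_sas.py to reduce noise when processing wide SAS files. The change installs an import-time filter aimed at the "DataFrame is highly fragmented" warning, which is irrelevant for our pipeline because we convert DataFrames directly to pyarrow tables. Note that the filter matches the entire PerformanceWarning category, so any other pandas PerformanceWarning is silenced as well.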

2026-04-21 13:40:38 -05:00 · 2026-04-21 13:40:38 -05:00 · a46f0518f6
commit a46f0518f6
parent 969a442775
1 changed files with 12 additions and 0 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -227,6 +227,7 @@ import math
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -239,8 +240,19 @@ import pyarrow.csv as pa_csv
 import pyreadstat
 import yaml
 from dotenv import load_dotenv
+from pandas.errors import PerformanceWarning
 from tqdm import tqdm

+# ``_prepare_for_copy`` builds its output frame one column at a time with
+# ``out[name] = ...``. On wide SAS files (~100+ columns) pandas prints a
+# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
+# nudge callers toward ``pd.concat(axis=1, ...)``. The fragmentation only
+# matters for row-oriented ops or in-place ``.copy()``; we hand the frame
+# straight to ``pyarrow.Table.from_pandas`` which reads columns
+# independently, so the warning is pure noise for our pipeline. Filter it
+# at import time; note this silences every pandas ``PerformanceWarning``.
+warnings.filterwarnings("ignore", category=PerformanceWarning)
+

 logger = logging.getLogger(__name__)