2026-04-21 22:32:18 +00:00
1 changed files with 12 additions and 0 deletions
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -227,6 +227,7 @@ import math
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -239,8 +240,19 @@ import pyarrow.csv as pa_csv
 import pyreadstat
 import yaml
 from dotenv import load_dotenv
+from pandas.errors import PerformanceWarning
 from tqdm import tqdm

+# ``_prepare_for_copy`` builds its output frame one column at a time
+# (``out[name] = ...``). On wide SAS files (~100+ columns) pandas emits
+# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
+# nudge callers toward ``pd.concat(axis=1, ...)``. We hand the frame straight
+# to ``pyarrow.Table.from_pandas``, which reads columns independently, so it
+# is noise here. ``message`` keeps every other ``PerformanceWarning`` visible.
+warnings.filterwarnings(
+    "ignore", message="DataFrame is highly fragmented", category=PerformanceWarning
+)
+

 logger = logging.getLogger(__name__)