From a46f0518f6ccc4f30d8c5f4ef8bf7ad61fc0e71b Mon Sep 17 00:00:00 2001
From: David Peterson
Date: Tue, 21 Apr 2026 13:40:38 -0500
Subject: [PATCH] Suppress PerformanceWarning in load_sas.py to reduce noise
 during processing of wide SAS files.

This change filters out warnings related to DataFrame fragmentation, which
are irrelevant for our pipeline as we directly convert DataFrames to pyarrow
tables.
---
 generic_loader/load_sas.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py
index 0100fc0..caed87a 100644
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -227,6 +227,7 @@ import math
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -239,8 +240,25 @@ import pyarrow.csv as pa_csv
 import pyreadstat
 import yaml
 from dotenv import load_dotenv
+from pandas.errors import PerformanceWarning
 from tqdm import tqdm
 
+# ``_prepare_for_copy`` builds its output frame one column at a time with
+# ``out[name] = ...``. On wide SAS files (~100+ columns) pandas prints a
+# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
+# nudge callers toward ``pd.concat(axis=1, ...)``. The fragmentation only
+# matters for row-oriented ops or in-place ``.copy()``; we hand the frame
+# straight to ``pyarrow.Table.from_pandas`` which reads columns
+# independently, so the warning is pure noise for our pipeline. Filter it
+# at import time. ``message`` is a regex matched against the start of the
+# warning text, so only the fragmentation warning is silenced and every
+# other ``PerformanceWarning`` still surfaces.
+warnings.filterwarnings(
+    "ignore",
+    message="DataFrame is highly fragmented",
+    category=PerformanceWarning,
+)
+
 logger = logging.getLogger(__name__)
 
 