Suppress PerformanceWarning in load_sas.py to reduce noise during processing of wide SAS files. This change filters out warnings related to DataFrame fragmentation, which are irrelevant for our pipeline as we directly convert DataFrames to pyarrow tables.

This commit is contained in:
David Peterson 2026-04-21 13:40:38 -05:00
parent 969a442775
commit a46f0518f6

View File

@ -227,6 +227,7 @@ import math
import os
import re
import sys
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
@ -239,8 +240,19 @@ import pyarrow.csv as pa_csv
import pyreadstat
import yaml
from dotenv import load_dotenv
from pandas.errors import PerformanceWarning
from tqdm import tqdm
# ``_prepare_for_copy`` builds its output frame one column at a time with
# ``out[name] = ...``. On wide SAS files (~100+ columns) pandas prints a
# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
# nudge callers toward ``pd.concat(axis=1, ...)``. The fragmentation only
# matters for row-oriented ops or in-place ``.copy()``; we hand the frame
# straight to ``pyarrow.Table.from_pandas`` which reads columns
# independently, so the warning is pure noise for our pipeline. Filter it
# at import time. ``message`` is a regex matched case-insensitively against
# the start of the warning text, so only the fragmentation notice is
# silenced; any other pandas ``PerformanceWarning`` still surfaces.
warnings.filterwarnings(
    "ignore",
    message=r"DataFrame is highly fragmented",
    category=PerformanceWarning,
)
logger = logging.getLogger(__name__)