diff --git a/generic_loader/load_sas.py b/generic_loader/load_sas.py
index 0100fc0..caed87a 100644
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -227,6 +227,7 @@ import math
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -239,6 +240,23 @@ import pyarrow.csv as pa_csv
 import pyreadstat
 import yaml
 from dotenv import load_dotenv
+from pandas.errors import PerformanceWarning
 from tqdm import tqdm
 
+# ``_prepare_for_copy`` builds its output frame one column at a time with
+# ``out[name] = ...``. On wide SAS files (~100+ columns) pandas emits a
+# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
+# nudge callers toward ``pd.concat(axis=1, ...)``. The frame is handed
+# straight to ``pyarrow.Table.from_pandas``, so for this pipeline the
+# warning is treated as noise and filtered at import time. The ``message``
+# regex (matched case-insensitively against the start of the warning text)
+# limits the filter to the fragmentation warning; every other
+# ``PerformanceWarning`` is still reported. Note the filter is
+# process-wide: it also applies to other code running after this import.
+warnings.filterwarnings(
+    "ignore",
+    message=r"DataFrame is highly fragmented",
+    category=PerformanceWarning,
+)
+
 logger = logging.getLogger(__name__)