Suppress PerformanceWarning in load_sas.py to reduce noise while processing wide SAS files. This change filters out pandas' PerformanceWarning category, which in this pipeline is raised for DataFrame fragmentation; that warning is irrelevant here because we convert DataFrames directly to pyarrow tables.
This commit is contained in:
parent
969a442775
commit
a46f0518f6
@ -227,6 +227,7 @@ import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
@ -239,8 +240,19 @@ import pyarrow.csv as pa_csv
|
||||
import pyreadstat
|
||||
import yaml
|
||||
from dotenv import load_dotenv
|
||||
from pandas.errors import PerformanceWarning
|
||||
from tqdm import tqdm
|
||||
|
||||
# ``_prepare_for_copy`` builds its output frame one column at a time with
|
||||
# ``out[name] = ...``. On wide SAS files (~100+ columns) pandas prints a
|
||||
# ``PerformanceWarning: DataFrame is highly fragmented`` once per chunk to
|
||||
# nudge callers toward ``pd.concat(axis=1, ...)``. The fragmentation only
|
||||
# matters for row-oriented ops or in-place ``.copy()``; we hand the frame
|
||||
# straight to ``pyarrow.Table.from_pandas`` which reads columns
|
||||
# independently, so the warning is pure noise for our pipeline. Filter it
|
||||
# at import time - the match is by category only, so every pandas
# ``PerformanceWarning`` is silenced, not just the fragmentation one.
|
||||
warnings.filterwarnings("ignore", category=PerformanceWarning)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user