Commit each batch (chunk) as it is loaded instead of once per cluster

This commit is contained in:
michael-corey 2026-04-20 08:38:38 -05:00
parent 508cc974ea
commit e39eb47a90
2 changed files with 14 additions and 10 deletions

View File

@@ -385,9 +385,10 @@ def _infer_cluster_schema(path: Path, include, exclude):
def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int: def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int:
"""Load every file in ``cluster`` into one table. Returns total rows loaded. """Load every file in ``cluster`` into one table. Returns total rows loaded.
The caller owns transaction boundaries. This function does NOT commit or Commits happen per chunk inside :func:`load_sas.copy_dataframes`. If a
roll back - :func:`main` does that per cluster so one bad cluster file mid-cluster fails, earlier chunks - including chunks from earlier
doesn't poison the rest of the run. files in the cluster - stay committed; only the in-flight chunk is
rolled back by :func:`main`.
""" """
if not cluster.files: if not cluster.files:
return 0 return 0
@@ -407,8 +408,8 @@ def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int:
for path in rest: for path in rest:
columns = _infer_cluster_schema(path, cluster.include, cluster.exclude) columns = _infer_cluster_schema(path, cluster.include, cluster.exclude)
# Uses the same check that if_exists=append runs. A type mismatch or # Uses the same check that if_exists=append runs. A type mismatch or
# missing column aborts the cluster; the transaction rollback in # missing column aborts the cluster; because chunks commit as they
# main() keeps the table from ending up half-loaded. # load, earlier chunks in the cluster remain in the table.
assert_schema_compatible(conn, schemaname, cluster.tablename, columns) assert_schema_compatible(conn, schemaname, cluster.tablename, columns)
total += _stream_file( total += _stream_file(
conn, schemaname, cluster.tablename, path, columns, conn, schemaname, cluster.tablename, path, columns,

View File

@@ -194,8 +194,8 @@ will fail mid-stream and the whole transaction rolls back. Set
matters more than speed. matters more than speed.
Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which
share one cursor and transaction so a failure mid-file rolls back the whole commit each chunk as it is copied so an interrupted load retains the rows
load. that were already written.
7. Tunables 7. Tunables
----------- -----------
@@ -1032,10 +1032,12 @@ def copy_dataframes(
dfs: Iterable[pd.DataFrame], dfs: Iterable[pd.DataFrame],
columns: Dict[str, ColumnSpec], columns: Dict[str, ColumnSpec],
) -> int: ) -> int:
"""Stream an iterable of DataFrames into one ``COPY`` session. """Stream an iterable of DataFrames into Postgres, committing each chunk.
All chunks share a cursor and transaction, so a failure mid-stream Each non-empty chunk is copied via ``COPY ... FROM STDIN`` and committed
rolls back the whole load when the caller hasn't committed yet. before the next chunk is processed, so an interrupted or failed load
retains the rows from previously committed chunks. The first chunk's
commit also flushes any pending DDL (e.g. a preceding ``CREATE TABLE``).
Empty chunks are skipped. Returns the total rows inserted. Empty chunks are skipped. Returns the total rows inserted.
""" """
col_list = ", ".join(_quote_ident(name) for name in columns.keys()) col_list = ", ".join(_quote_ident(name) for name in columns.keys())
@@ -1060,6 +1062,7 @@ def copy_dataframes(
) )
buf.seek(0) buf.seek(0)
cur.copy_expert(sql, buf) cur.copy_expert(sql, buf)
conn.commit()
total += len(prepared) total += len(prepared)
return total return total