Commit each chunk (batch) as it is copied instead of once per cluster

2026-04-20 08:38:38 -05:00 · 2026-04-20 08:38:38 -05:00 · e39eb47a90
commit e39eb47a90
parent 508cc974ea
2 changed files with 14 additions and 10 deletions
--- a/generic_loader/load_folder.py
+++ b/generic_loader/load_folder.py
@@ -385,9 +385,10 @@ def _infer_cluster_schema(path: Path, include, exclude):
 def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int:
    """Load every file in ``cluster`` into one table. Returns total rows loaded.
-    The caller owns transaction boundaries. This function does NOT commit or
+    Commits happen per chunk inside :func:`load_sas.copy_dataframes`. If a
-    roll back - :func:`main` does that per cluster so one bad cluster
+    file mid-cluster fails, earlier chunks - including chunks from earlier
-    doesn't poison the rest of the run.
+    files in the cluster - stay committed; only the in-flight chunk is
+    rolled back by :func:`main`.
    """
    if not cluster.files:
        return 0
@@ -407,8 +408,8 @@ def load_cluster(conn, cluster: ClusterSpec, schemaname: str) -> int:
    for path in rest:
        columns = _infer_cluster_schema(path, cluster.include, cluster.exclude)
        # Uses the same check that if_exists=append runs. A type mismatch or
-        # missing column aborts the cluster; the transaction rollback in
+        # missing column aborts the cluster; because chunks commit as they
-        # main() keeps the table from ending up half-loaded.
+        # load, earlier chunks in the cluster remain in the table.
        assert_schema_compatible(conn, schemaname, cluster.tablename, columns)
        total += _stream_file(
            conn, schemaname, cluster.tablename, path, columns,
--- a/generic_loader/load_sas.py
+++ b/generic_loader/load_sas.py
@@ -194,8 +194,8 @@ will fail mid-stream and the whole transaction rolls back. Set
 matters more than speed.
 Streaming loads use :func:`iter_sas_chunks` + :func:`copy_dataframes`, which
-share one cursor and transaction so a failure mid-file rolls back the whole
+commit each chunk as it is copied so an interrupted load retains the rows
-load.
+that were already written.
 7. Tunables
 -----------
@@ -1032,10 +1032,12 @@ def copy_dataframes(
    dfs: Iterable[pd.DataFrame],
    columns: Dict[str, ColumnSpec],
 ) -> int:
-    """Stream an iterable of DataFrames into one ``COPY`` session.
+    """Stream an iterable of DataFrames into Postgres, committing each chunk.
-    All chunks share a cursor and transaction, so a failure mid-stream
+    Each non-empty chunk is copied via ``COPY ... FROM STDIN`` and committed
-    rolls back the whole load when the caller hasn't committed yet.
+    before the next chunk is processed, so an interrupted or failed load
+    retains the rows from previously committed chunks. The first chunk's
+    commit also flushes any pending DDL (e.g. a preceding ``CREATE TABLE``).
    Empty chunks are skipped. Returns the total rows inserted.
    """
    col_list = ", ".join(_quote_ident(name) for name in columns.keys())
@@ -1060,6 +1062,7 @@ def copy_dataframes(
            )
            buf.seek(0)
            cur.copy_expert(sql, buf)
+            conn.commit()
            total += len(prepared)
    return total