refactor: widen SessionContext.read_batches to accept any iterable

timsaucer · claude · timsaucer · commit d3a4af46ce9b · 2026-05-26T10:05:35.000-04:00
The underlying PyArrow FFI extractor for `Vec<RecordBatch>` requires a
Python `list`, so the previous `list[pa.RecordBatch]` annotation was
accurate but unnecessarily strict. Accept any
`Iterable[pa.RecordBatch]` on the Python side and materialize to a
list before crossing the FFI boundary so callers can pass generators,
tuples, or other iterables without manual conversion.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -82,7 +82,7 @@
 
 if TYPE_CHECKING:
     import pathlib
-    from collections.abc import Sequence
+    from collections.abc import Iterable, Sequence
 
     import pandas as pd
     import polars as pl  # type: ignore[import]
@@ -981,13 +981,15 @@ def read_batch(self, batch: pa.RecordBatch) -> DataFrame:
         """
         return self.read_batches([batch])
 
-    def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame:
+    def read_batches(self, batches: Iterable[pa.RecordBatch]) -> DataFrame:
         """Return a :py:class:`~datafusion.DataFrame` reading the given batches.
 
-        All batches must share the same schema. Unlike
-        :py:meth:`register_record_batches`, this does not register the batches
-        as a named table; it returns an anonymous
-        :py:class:`~datafusion.DataFrame` directly.
+        All batches must share the same schema. Any iterable of
+        :py:class:`pa.RecordBatch` is accepted (list, tuple, generator);
+        it is materialized into a list before being handed to the
+        underlying Rust binding. Unlike :py:meth:`register_record_batches`,
+        this does not register the batches as a named table; it returns
+        an anonymous :py:class:`~datafusion.DataFrame` directly.
 
         Args:
             batches: Record batches to wrap as a DataFrame.
@@ -998,8 +1000,13 @@ def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame:
             >>> b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
             >>> ctx.read_batches([b1, b2]).to_pydict()
             {'a': [1, 2, 3, 4]}
+
+            A generator works too:
+
+            >>> ctx.read_batches(b for b in [b1, b2]).to_pydict()
+            {'a': [1, 2, 3, 4]}
         """
-        return DataFrame(self.ctx.read_batches(batches))
+        return DataFrame(self.ctx.read_batches(list(batches)))
 
     def register_parquet(
         self,
diff --git a/python/tests/test_context.py b/python/tests/test_context.py
@@ -920,6 +920,17 @@ def test_read_batches_concatenates(ctx):
     assert df.to_pydict() == {"a": [1, 2, 3, 4]}
 
 
+def test_read_batches_accepts_iterable(ctx):
+    b1 = pa.RecordBatch.from_pydict({"a": [1, 2]})
+    b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
+    # Generator: ensures non-list iterables are materialized before FFI.
+    df = ctx.read_batches(b for b in (b1, b2))
+    assert df.to_pydict() == {"a": [1, 2, 3, 4]}
+    # Tuple: same.
+    df = ctx.read_batches((b1, b2))
+    assert df.to_pydict() == {"a": [1, 2, 3, 4]}
+
+
 def test_create_sql_options():
     SQLOptions()