Skip to content

Commit d3a4af4

Browse files
timsaucerclaude
andcommitted
refactor: widen SessionContext.read_batches to accept any iterable
The underlying PyArrow FFI extractor for `Vec<RecordBatch>` requires a Python `list`, so the previous `list[pa.RecordBatch]` annotation was accurate but unnecessarily strict. Accept any `Iterable[pa.RecordBatch]` on the Python side and materialize to a list before crossing the FFI boundary so callers can pass generators, tuples, or other iterables without manual conversion. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 90705a6 commit d3a4af4

2 files changed

Lines changed: 25 additions & 7 deletions

File tree

python/datafusion/context.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282

8383
if TYPE_CHECKING:
8484
import pathlib
85-
from collections.abc import Sequence
85+
from collections.abc import Iterable, Sequence
8686

8787
import pandas as pd
8888
import polars as pl # type: ignore[import]
@@ -981,13 +981,15 @@ def read_batch(self, batch: pa.RecordBatch) -> DataFrame:
981981
"""
982982
return self.read_batches([batch])
983983

984-
def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame:
984+
def read_batches(self, batches: Iterable[pa.RecordBatch]) -> DataFrame:
985985
"""Return a :py:class:`~datafusion.DataFrame` reading the given batches.
986986
987-
All batches must share the same schema. Unlike
988-
:py:meth:`register_record_batches`, this does not register the batches
989-
as a named table; it returns an anonymous
990-
:py:class:`~datafusion.DataFrame` directly.
987+
All batches must share the same schema. Any iterable of
988+
:py:class:`pa.RecordBatch` is accepted (list, tuple, generator);
989+
it is materialized into a list before being handed to the
990+
underlying Rust binding. Unlike :py:meth:`register_record_batches`,
991+
this does not register the batches as a named table; it returns
992+
an anonymous :py:class:`~datafusion.DataFrame` directly.
991993
992994
Args:
993995
batches: Record batches to wrap as a DataFrame.
@@ -998,8 +1000,13 @@ def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame:
9981000
>>> b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
9991001
>>> ctx.read_batches([b1, b2]).to_pydict()
10001002
{'a': [1, 2, 3, 4]}
1003+
1004+
A generator works too:
1005+
1006+
>>> ctx.read_batches(b for b in [b1, b2]).to_pydict()
1007+
{'a': [1, 2, 3, 4]}
10011008
"""
1002-
return DataFrame(self.ctx.read_batches(batches))
1009+
return DataFrame(self.ctx.read_batches(list(batches)))
10031010

10041011
def register_parquet(
10051012
self,

python/tests/test_context.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -920,6 +920,17 @@ def test_read_batches_concatenates(ctx):
920920
assert df.to_pydict() == {"a": [1, 2, 3, 4]}
921921

922922

923+
def test_read_batches_accepts_iterable(ctx):
924+
b1 = pa.RecordBatch.from_pydict({"a": [1, 2]})
925+
b2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
926+
# Generator: ensures non-list iterables are materialized before FFI.
927+
df = ctx.read_batches(b for b in (b1, b2))
928+
assert df.to_pydict() == {"a": [1, 2, 3, 4]}
929+
# Tuple: same.
930+
df = ctx.read_batches((b1, b2))
931+
assert df.to_pydict() == {"a": [1, 2, 3, 4]}
932+
933+
923934
def test_create_sql_options():
924935
SQLOptions()
925936

0 commit comments

Comments
 (0)