1515from __future__ import annotations
1616
1717import datetime
18- import functools
1918import typing
2019from typing import Iterable , Literal , Sequence , Tuple , Union
2120
2928from bigframes .core import log_adapter
3029import bigframes .core .block_transforms as block_ops
3130import bigframes .core .blocks as blocks
32- from bigframes .core .groupby import aggs
31+ from bigframes .core .groupby import aggs , group_by
3332import bigframes .core .ordering as order
3433import bigframes .core .utils as utils
3534import bigframes .core .validations as validations
3837import bigframes .core .window_spec as window_specs
3938import bigframes .dataframe as df
4039import bigframes .dtypes
41- import bigframes .enums
42- import bigframes .operations as ops
4340import bigframes .operations .aggregations as agg_ops
4441import bigframes .series as series
4542
@@ -55,6 +52,8 @@ def __init__(
5552 by_col_ids : typing .Sequence [str ],
5653 value_name : blocks .Label = None ,
5754 dropna = True ,
55+ * ,
56+ by_key_is_singular : bool = False ,
5857 ):
5958 # TODO(tbergeron): Support more group-by expression types
6059 self ._block = block
@@ -63,6 +62,10 @@ def __init__(
6362 self ._value_name = value_name
6463 self ._dropna = dropna # Applies to aggregations but not windowing
6564
65+ self ._by_key_is_singular = by_key_is_singular
66+ if by_key_is_singular :
67+ assert len (by_col_ids ) == 1 , "singular key should be exactly one group key"
68+
@property
def _session(self) -> session.Session:
    """Return the session attached to the underlying block."""
    return self._block.session
@@ -79,56 +82,17 @@ def head(self, n: int = 5) -> series.Series:
7982 )
8083
def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
    """Iterate over the groups, yielding ``(group_keys, series)`` pairs.

    Grouping is delegated to ``group_by.block_groupby_iter``, which is
    called with this object's block, group-by column ids, singular-key
    flag and ``dropna`` setting. For every filtered block it produces,
    the value column is selected and wrapped in a ``Series`` whose name
    is set to ``self._value_name``.

    Yields:
        Tuples of the group key(s) as returned by
        ``block_groupby_iter`` and the ``Series`` for that group.
    """
    groups = group_by.block_groupby_iter(
        self._block,
        by_col_ids=self._by_col_ids,
        by_key_is_singular=self._by_key_is_singular,
        dropna=self._dropna,
    )
    for group_keys, filtered_block in groups:
        filtered_series = series.Series(
            filtered_block.select_column(self._value_column)
        )
        # The selected column does not carry the user-facing name, so
        # it is assigned explicitly on each yielded series.
        filtered_series.name = self._value_name
        yield group_keys, filtered_series
13296
def all(self) -> series.Series:
    """Aggregate each group with ``agg_ops.all_op`` via ``_aggregate``."""
    return self._aggregate(agg_ops.all_op)
0 commit comments