2222import queue
2323import threading
2424import typing
25- from typing import Any , Iterator , Optional , Sequence , Tuple
25+ from typing import Any , Iterator , List , Optional , Sequence , Tuple , Union
2626
2727from google .cloud import bigquery_storage_v1
2828import google .cloud .bigquery as bq
3737 import bigframes .core .ordering as orderings
3838
3939
# The line between "metadata" and core fields is fuzzy: metadata fields are
# mostly optional and may be unreliable, so they live in their own dataclass.
@dataclasses.dataclass(frozen=True)
class TableMetadata:
    """Auxiliary, possibly-stale table metadata.

    The size statistics in particular may lag behind the table's true
    state; do not use them where strict correctness is needed.
    """

    # Size metadata might be stale — informational only.
    numBytes: Optional[int] = None
    numRows: Optional[int] = None
    location: Optional[str] = None
    type: Optional[str] = None
    created_time: Optional[datetime.datetime] = None
    modified_time: Optional[datetime.datetime] = None
51+
52+ @dataclasses .dataclass (frozen = True )
53+ class GbqNativeTable :
4254 project_id : str = dataclasses .field ()
4355 dataset_id : str = dataclasses .field ()
4456 table_id : str = dataclasses .field ()
4557 physical_schema : Tuple [bq .SchemaField , ...] = dataclasses .field ()
4658 is_physically_stored : bool = dataclasses .field ()
47- cluster_cols : typing .Optional [Tuple [str , ...]]
59+ partition_col : Optional [str ] = None
60+ cluster_cols : typing .Optional [Tuple [str , ...]] = None
61+ primary_key : Optional [Tuple [str , ...]] = None
62+ metadata : TableMetadata = TableMetadata ()
4863
4964 @staticmethod
50- def from_table (table : bq .Table , columns : Sequence [str ] = ()) -> GbqTable :
65+ def from_table (table : bq .Table , columns : Sequence [str ] = ()) -> GbqNativeTable :
5166 # Subsetting fields with columns can reduce cost of row-hash default ordering
5267 if columns :
5368 schema = tuple (item for item in table .schema if item .name in columns )
5469 else :
5570 schema = tuple (table .schema )
56- return GbqTable (
71+
72+ metadata = TableMetadata (
73+ numBytes = table .num_bytes ,
74+ numRows = table .num_rows ,
75+ location = table .location , # type: ignore
76+ type = table .table_type , # type: ignore
77+ created_time = table .created ,
78+ modified_time = table .modified ,
79+ )
80+
81+ return GbqNativeTable (
5782 project_id = table .project ,
5883 dataset_id = table .dataset_id ,
5984 table_id = table .table_id ,
@@ -62,15 +87,17 @@ def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable:
6287 cluster_cols = None
6388 if table .clustering_fields is None
6489 else tuple (table .clustering_fields ),
90+ primary_key = tuple (_get_primary_keys (table )),
91+ metadata = metadata ,
6592 )
6693
6794 @staticmethod
6895 def from_ref_and_schema (
6996 table_ref : bq .TableReference ,
7097 schema : Sequence [bq .SchemaField ],
7198 cluster_cols : Optional [Sequence [str ]] = None ,
72- ) -> GbqTable :
73- return GbqTable (
99+ ) -> GbqNativeTable :
100+ return GbqNativeTable (
74101 project_id = table_ref .project ,
75102 dataset_id = table_ref .dataset_id ,
76103 table_id = table_ref .table_id ,
@@ -84,12 +111,48 @@ def get_table_ref(self) -> bq.TableReference:
84111 bq .DatasetReference (self .project_id , self .dataset_id ), self .table_id
85112 )
86113
114+ def get_full_id (self , quoted : bool = False ) -> str :
115+ if quoted :
116+ return f"`{ self .project_id } `.`{ self .dataset_id } `.`{ self .table_id } `"
117+ return f"{ self .project_id } .{ self .dataset_id } .{ self .table_id } "
118+
    @property
    @functools.cache
    def schema_by_id(self):
        """Map column name -> ``bq.SchemaField`` for the physical schema.

        NOTE(review): ``functools.cache`` on an instance method keys the
        cache on ``self`` and keeps every instance alive for the cache's
        lifetime (ruff B019). Presumably acceptable because these frozen
        descriptors are long-lived value objects — confirm.
        """
        return {col.name: col for col in self.physical_schema}
91123
92124
@dataclasses.dataclass(frozen=True)
class BiglakeIcebergTable:
    """An Iceberg table addressed through a BigLake catalog.

    Mirrors the native-table surface (``get_full_id``, ``schema_by_id``,
    ``partition_col``, ``primary_key``) but uses a four-part
    ``project.catalog.namespace.table`` identifier.
    """

    project_id: str
    catalog_id: str
    namespace_id: str
    table_id: str
    physical_schema: Tuple[bq.SchemaField, ...]
    cluster_cols: typing.Optional[Tuple[str, ...]]
    metadata: TableMetadata

    def get_full_id(self, quoted: bool = False) -> str:
        """Return the four-part id, backtick-quoting each part if requested."""
        parts = (
            self.project_id,
            self.catalog_id,
            self.namespace_id,
            self.table_id,
        )
        if quoted:
            parts = tuple(f"`{part}`" for part in parts)
        return ".".join(parts)

    @property
    @functools.cache
    def schema_by_id(self):
        """Map column name -> ``bq.SchemaField`` for the physical schema."""
        return {field.name: field for field in self.physical_schema}

    @property
    def partition_col(self) -> Optional[str]:
        # Always None for this table type.
        return None

    @property
    def primary_key(self) -> Optional[Tuple[str, ...]]:
        # Always None for this table type.
        return None
154+
155+
93156@dataclasses .dataclass (frozen = True )
94157class BigqueryDataSource :
95158 """
@@ -104,7 +167,7 @@ def __post_init__(self):
104167 self .schema .names
105168 )
106169
107- table : GbqTable
170+ table : Union [ GbqNativeTable , BiglakeIcebergTable ]
108171 schema : bigframes .core .schema .ArraySchema
109172 at_time : typing .Optional [datetime .datetime ] = None
110173 # Added for backwards compatibility, not validated
@@ -188,6 +251,8 @@ def get_arrow_batches(
188251 project_id : str ,
189252 sample_rate : Optional [float ] = None ,
190253) -> ReadResult :
254+ assert isinstance (data .table , GbqNativeTable )
255+
191256 table_mod_options = {}
192257 read_options_dict : dict [str , Any ] = {"selected_fields" : list (columns )}
193258
@@ -245,3 +310,21 @@ def process_batch(pa_batch):
245310 return ReadResult (
246311 batches , session .estimated_row_count , session .estimated_total_bytes_scanned
247312 )
313+
314+
315+ def _get_primary_keys (
316+ table : bq .Table ,
317+ ) -> List [str ]:
318+ """Get primary keys from table if they are set."""
319+
320+ primary_keys : List [str ] = []
321+ if (
322+ (table_constraints := getattr (table , "table_constraints" , None )) is not None
323+ and (primary_key := table_constraints .primary_key ) is not None
324+ # This will be False for either None or empty list.
325+ # We want primary_keys = None if no primary keys are set.
326+ and (columns := primary_key .columns )
327+ ):
328+ primary_keys = columns if columns is not None else []
329+
330+ return primary_keys
0 commit comments