From 6198e8457bf51327744e66b81df85eee6a32d1b7 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 16:34:54 +0000 Subject: [PATCH 1/6] fix: allow multi-part dataset IDs to support BigLake tables Relaxes DatasetReference.from_string and TableReference.from_string validation. --- .../cloud/bigquery/_string_references.py | 99 ++++++++++++ .../google/cloud/bigquery/dataset.py | 30 +--- .../google/cloud/bigquery/table.py | 21 +-- .../tests/unit/test_dataset.py | 48 ------ .../tests/unit/test_string_references.py | 143 ++++++++++++++++++ .../tests/unit/test_table.py | 54 ------- 6 files changed, 256 insertions(+), 139 deletions(-) create mode 100644 packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py create mode 100644 packages/google-cloud-bigquery/tests/unit/test_string_references.py diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py new file mode 100644 index 000000000000..d7c8815d6c95 --- /dev/null +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py @@ -0,0 +1,99 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper to turn string references into REST resources.""" + +# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes. 
+ +import typing + +from google.cloud.bigquery import _helpers + + +ParsedDatasetReference = typing.TypedDict( + 'ParsedDatasetReference', + { + "projectId": str, + "datasetId": str, + } +) + + +ParsedTableReference = typing.TypedDict( + 'ParsedTableReference', + { + "projectId": str, + "datasetId": str, + "tableId": str, + } +) + + +def parse_dataset_reference(dataset_id: str, *, default_project: str | None) -> ParsedDatasetReference: + """Parse a dataset ID string. + + Returns: + ParsedDatasetReference: A typed dictionary (to avoid circular dependencies). + + Raises: + ValueError: When a fully-qualified dataset ID can't be determined. + """ + output_dataset_id = dataset_id + parts = _helpers._split_id(dataset_id) + + if len(parts) == 1: + if default_project is not None: + output_project_id = default_project + else: + raise ValueError( + "When default_project is not set, dataset_id must be a " + "fully-qualified dataset ID in standard SQL format, " + 'e.g., "project.dataset_id" got {}'.format(dataset_id) + ) + elif len(parts) == 2: + output_project_id, output_dataset_id = parts + else: + raise ValueError( + "Too many parts in dataset_id. Expected a fully-qualified " + "dataset ID in standard SQL format, " + 'e.g. "project.dataset_id", got {}'.format(dataset_id) + ) + + return {"datasetId": output_dataset_id, "projectId": output_project_id} + + +def parse_table_reference(table_id: str, *, default_project: str | None) -> ParsedTableReference: + """Parse a table ID string. + + Returns: + ParsedTableReference: A typed dictionary (to avoid circular dependencies). + + Raises: + ValueError: When a fully-qualified table ID can't be determined. + """ + ( + output_project_id, + output_dataset_id, + output_table_id, + ) = _helpers._parse_3_part_id( + table_id, default_project=default_project, property_name="table_id" + ) + + if output_project_id is None: + raise ValueError( + "Could not determine project ID. 
Supply a fully-qualified table ID, " + f"such as 'project.dataset.table', got {table_id}." + ) + + return {"projectId": output_project_id, "datasetId": output_dataset_id, "tableId": output_table_id} diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py index 878b77d4186c..3ecc87068c0a 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py @@ -30,6 +30,7 @@ from google.cloud.bigquery.table import Table, TableReference from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration from google.cloud.bigquery import external_config +from google.cloud.bigquery import _string_references def _get_table_reference(self, table_id: str) -> TableReference: @@ -123,7 +124,7 @@ def path(self): routine = _get_routine_reference @classmethod - def from_api_repr(cls, resource: dict) -> "DatasetReference": + def from_api_repr(cls, resource: dict | _string_references.ParsedDatasetReference) -> "DatasetReference": """Factory: construct a dataset reference given its API representation Args: @@ -166,28 +167,11 @@ def from_string( If ``dataset_id`` is not a fully-qualified dataset ID in standard SQL format. """ - output_dataset_id = dataset_id - parts = _helpers._split_id(dataset_id) - - if len(parts) == 1: - if default_project is not None: - output_project_id = default_project - else: - raise ValueError( - "When default_project is not set, dataset_id must be a " - "fully-qualified dataset ID in standard SQL format, " - 'e.g., "project.dataset_id" got {}'.format(dataset_id) - ) - elif len(parts) == 2: - output_project_id, output_dataset_id = parts - else: - raise ValueError( - "Too many parts in dataset_id. Expected a fully-qualified " - "dataset ID in standard SQL format, " - 'e.g. 
"project.dataset_id", got {}'.format(dataset_id) - ) - - return cls(output_project_id, output_dataset_id) + return cls.from_api_repr( + _string_references.parse_dataset_reference( + dataset_id=dataset_id, + default_project=default_project, + )) def to_api_repr(self) -> dict: """Construct the API resource representation of this dataset reference diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py index 8ba877026d9c..2eb2fd890e54 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py @@ -72,6 +72,7 @@ from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields from google.cloud.bigquery import external_config +from google.cloud.bigquery import _string_references if typing.TYPE_CHECKING: # pragma: NO COVER # Unconditionally import optional dependencies again to tell pytype that @@ -281,22 +282,14 @@ def from_string( If ``table_id`` is not a fully-qualified table ID in standard SQL format. 
""" - from google.cloud.bigquery.dataset import DatasetReference - - ( - output_project_id, - output_dataset_id, - output_table_id, - ) = _helpers._parse_3_part_id( - table_id, default_project=default_project, property_name="table_id" - ) - - return cls( - DatasetReference(output_project_id, output_dataset_id), output_table_id - ) + return cls.from_api_repr( + _string_references.parse_table_reference( + table_id=table_id, + default_project=default_project, + )) @classmethod - def from_api_repr(cls, resource: dict) -> "TableReference": + def from_api_repr(cls, resource: dict | _string_references.ParsedTableReference) -> "TableReference": """Factory: construct a table reference given its API representation Args: diff --git a/packages/google-cloud-bigquery/tests/unit/test_dataset.py b/packages/google-cloud-bigquery/tests/unit/test_dataset.py index 604e5ed2e4bf..98466544b5ae 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_dataset.py +++ b/packages/google-cloud-bigquery/tests/unit/test_dataset.py @@ -820,54 +820,6 @@ def test_from_api_repr(self): self.assertEqual(expected, got) - def test_from_string(self): - cls = self._get_target_class() - got = cls.from_string("string-project.string_dataset") - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_w_prefix(self): - cls = self._get_target_class() - got = cls.from_string("google.com:string-project.string_dataset") - self.assertEqual(got.project, "google.com:string-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_legacy_string(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string-project:string_dataset") - - def test_from_string_w_incorrect_prefix(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com.string-project.dataset_id") - - def test_from_string_w_prefix_and_too_many_parts(self): - cls = 
self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com:string-project.dataset_id.table_id") - - def test_from_string_not_fully_qualified(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string_dataset") - with self.assertRaises(ValueError): - cls.from_string("a.b.c") - - def test_from_string_with_default_project(self): - cls = self._get_target_class() - got = cls.from_string("string_dataset", default_project="default-project") - self.assertEqual(got.project, "default-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_ignores_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string-project.string_dataset", default_project="default-project" - ) - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - def test___eq___wrong_type(self): dataset = self._make_one("project_1", "dataset_1") other = object() diff --git a/packages/google-cloud-bigquery/tests/unit/test_string_references.py b/packages/google-cloud-bigquery/tests/unit/test_string_references.py new file mode 100644 index 000000000000..2d039b1c9e65 --- /dev/null +++ b/packages/google-cloud-bigquery/tests/unit/test_string_references.py @@ -0,0 +1,143 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from google.cloud import bigquery + + +@pytest.mark.parametrize( + ("value", "default_project", "expected_project_id", "expected_dataset_id"), + ( + ( + "string-project.string_dataset", + None, + "string-project", + "string_dataset", + ), + ( + "google.com:string-project.string_dataset", + None, + "google.com:string-project", + "string_dataset", + ), + ( + "string_dataset", + "default-project", + "default-project", + "string_dataset", + ), + ( + "string-project.string_dataset", + "default-project", + "string-project", + "string_dataset", + ), + ), +) +def test_dataset_reference(value, default_project, expected_project_id, expected_dataset_id): + got = bigquery.DatasetReference.from_string(value, default_project=default_project) + assert got.project == expected_project_id + assert got.dataset_id == expected_dataset_id + + +@pytest.mark.parametrize( + ("value", "expected_error_message"), + ( + ( + "string_dataset", + "dataset_id must be a fully-qualified dataset ID", + ), + ( + "string-project:string_dataset", + "dataset_id must be a fully-qualified dataset ID", + ), + ( + "google.com.string-project.dataset_id", + "Too many parts in dataset_id.", + ), + ( + "google.com:string-project.dataset_id.table_id", + "Too many parts in dataset_id.", + ) + ), +) +def test_dataset_reference_without_default_project_value_error(value, expected_error_message): + with pytest.raises(ValueError, match=expected_error_message): + bigquery.DatasetReference.from_string(value, default_project=None) + + +@pytest.mark.parametrize( + ("value", "default_project", "expected_project_id", "expected_dataset_id", "expected_table_id"), + ( + ( + "string-project.string_dataset.string_table", + None, + "string-project", + "string_dataset", + "string_table", + ), + ( + "google.com:string-project.string_dataset.string_table", + None, + "google.com:string-project", + "string_dataset", + "string_table", + ), + ( + "string_dataset.string_table", + "default-project", + "default-project", 
+ "string_dataset", + "string_table", + ), + ( + "my-project.string_dataset.string_table", + "ignored-default-project", + "my-project", + "string_dataset", + "string_table", + ), + ), +) +def test_table_reference(value, default_project, expected_project_id, expected_dataset_id, expected_table_id): + got = bigquery.TableReference.from_string(value, default_project=default_project) + assert got.project == expected_project_id + assert got.dataset_id == expected_dataset_id + assert got.table_id == expected_table_id + + +@pytest.mark.parametrize( + ("value",), + ( + ( + "string_table", + ), + ( + "string_dataset.string_table", + ), + ( + "string-project:string_dataset.string_table", + ), + ( + "google.com.string-project.dataset_id", + ), + ( + "a.b.c.d", + ) + ), +) +def test_table_reference_without_default_project_value_error(value): + with pytest.raises(ValueError, match="table_id must be a fully-qualified ID in standard SQL format"): + bigquery.TableReference.from_string(value, default_project=None) diff --git a/packages/google-cloud-bigquery/tests/unit/test_table.py b/packages/google-cloud-bigquery/tests/unit/test_table.py index 32de1ac3497c..0297156aef95 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_table.py +++ b/packages/google-cloud-bigquery/tests/unit/test_table.py @@ -302,60 +302,6 @@ def test_from_api_repr(self): self.assertEqual(expected, got) - def test_from_string(self): - cls = self._get_target_class() - got = cls.from_string("string-project.string_dataset.string_table") - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def test_from_string_w_prefix(self): - cls = self._get_target_class() - got = cls.from_string("google.com:string-project.string_dataset.string_table") - self.assertEqual(got.project, "google.com:string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def 
test_from_string_legacy_string(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string-project:string_dataset.string_table") - - def test_from_string_w_incorrect_prefix(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com.string-project.string_dataset.string_table") - - def test_from_string_not_fully_qualified(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string_table") - - with self.assertRaises(ValueError): - cls.from_string("string_dataset.string_table") - - with self.assertRaises(ValueError): - cls.from_string("a.b.c.d") - - def test_from_string_with_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string_dataset.string_table", default_project="default-project" - ) - self.assertEqual(got.project, "default-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def test_from_string_ignores_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string-project.string_dataset.string_table", - default_project="default-project", - ) - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - def test___repr__(self): dataset = DatasetReference("project1", "dataset1") table1 = self._make_one(dataset, "table1") From 33d4c7d09eb25a73a8c65db210f9cc3bc3dee3ec Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 18:51:42 +0000 Subject: [PATCH 2/6] implement and unit tests --- .../cloud/bigquery/_string_references.py | 183 ++++++++++++------ .../google/cloud/bigquery/dataset.py | 7 +- .../google/cloud/bigquery/table.py | 7 +- .../tests/unit/test_string_references.py | 102 +++++++--- 4 files changed, 207 insertions(+), 92 deletions(-) diff --git 
a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py index d7c8815d6c95..1fc0b89a1706 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py @@ -16,84 +16,153 @@ # TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes. +from __future__ import annotations + +import re import typing from google.cloud.bigquery import _helpers ParsedDatasetReference = typing.TypedDict( - 'ParsedDatasetReference', + "ParsedDatasetReference", { "projectId": str, "datasetId": str, - } + }, ) ParsedTableReference = typing.TypedDict( - 'ParsedTableReference', + "ParsedTableReference", { "projectId": str, "datasetId": str, "tableId": str, - } + }, ) -def parse_dataset_reference(dataset_id: str, *, default_project: str | None) -> ParsedDatasetReference: - """Parse a dataset ID string. - - Returns: - ParsedDatasetReference: A typed dictionary (to avoid circular dependencies). - - Raises: - ValueError: When a fully-qualified dataset ID can't be determined. - """ - output_dataset_id = dataset_id - parts = _helpers._split_id(dataset_id) - - if len(parts) == 1: - if default_project is not None: - output_project_id = default_project - else: - raise ValueError( - "When default_project is not set, dataset_id must be a " - "fully-qualified dataset ID in standard SQL format, " - 'e.g., "project.dataset_id" got {}'.format(dataset_id) - ) - elif len(parts) == 2: - output_project_id, output_dataset_id = parts +_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile( + # In the past, organizations could prefix their project IDs with a domain + # name. Such projects still exist, especially at Google. + r"^(?P[^:]+:)?" + r"(?P[^.]+)\." + # Match dataset or catalog + namespace. + # + # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. 
Support + # this without catastrophic backtracking by moving the trailing "." to the + # table group. + r"(?P.*)" +) + + +_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile( + # In the past, organizations could prefix their project IDs with a domain + # name. Such projects still exist, especially at Google. + r"^(?P[^:]+:)?" + r"(?P[^.]+)\." + # Match dataset or catalog + namespace. + # + # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support + # this without catastrophic backtracking by moving the trailing "." to the + # table group. + r"(?P.*)" + # Table names can't contain ".", as that's used as the separator. + r"\.(?P[^.]+)$" +) + + +_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile( + # Match dataset or catalog + namespace. + # + # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support + # this without catastrophic backtracking by moving the trailing "." to the + # table group. + r"(?P.*)" + # Table names can't contain ".", as that's used as the separator. + r"\.(?P
[^.]+)$" +) + + +def parse_dataset_reference( + dataset_id: str, *, default_project: str | None +) -> ParsedDatasetReference: + """Parse a dataset ID string. + + Returns: + ParsedDatasetReference: A typed dictionary (to avoid circular dependencies). + + Raises: + ValueError: When a fully-qualified dataset ID can't be determined. + """ + regex_match = _FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN.match(dataset_id) + if regex_match: + legacy_project_domain = regex_match.group("legacy_project_domain") + project = regex_match.group("project") + + if legacy_project_domain: + output_project_id = f"{legacy_project_domain}{project}" + else: + output_project_id = project + + return { + "projectId": output_project_id, + "datasetId": regex_match.group("inner_parts"), + } + + if not default_project: + raise ValueError( + "When default_project is not set, dataset_id must be a " + "fully-qualified dataset ID in standard SQL format, " + 'e.g., "project.dataset_id" got {}'.format(dataset_id) + ) + + return {"datasetId": dataset_id, "projectId": default_project} + + +def parse_table_reference( + table_id: str, *, default_project: str | None +) -> ParsedTableReference: + """Parse a table ID string. + + Returns: + ParsedTableReference: A typed dictionary (to avoid circular dependencies). + + Raises: + ValueError: When a fully-qualified table ID can't be determined. + """ + regex_match = _FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN.match(table_id) + if regex_match: + legacy_project_domain = regex_match.group("legacy_project_domain") + project = regex_match.group("project") + + if legacy_project_domain: + output_project_id = f"{legacy_project_domain}{project}" else: - raise ValueError( - "Too many parts in dataset_id. Expected a fully-qualified " - "dataset ID in standard SQL format, " - 'e.g. 
"project.dataset_id", got {}'.format(dataset_id) - ) - - return {"datasetId": output_dataset_id, "projectId": output_project_id} - - -def parse_table_reference(table_id: str, *, default_project: str | None) -> ParsedTableReference: - """Parse a table ID string. - - Returns: - ParsedTableReference: A typed dictionary (to avoid circular dependencies). - - Raises: - ValueError: When a fully-qualified table ID can't be determined. - """ - ( - output_project_id, - output_dataset_id, - output_table_id, - ) = _helpers._parse_3_part_id( - table_id, default_project=default_project, property_name="table_id" + output_project_id = project + + return { + "projectId": output_project_id, + "datasetId": regex_match.group("inner_parts"), + "tableId": regex_match.group("table"), + } + + if not default_project: + raise ValueError( + "Could not determine project ID. Supply a default project or a fully-qualified table ID, " + f"such as 'project.dataset.table'. Got {table_id}." ) - if output_project_id is None: - raise ValueError( - "Could not determine project ID. Supply a fully-qualified table ID, " - f"such as 'project.dataset.table', got {table_id}." - ) + regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id) + if not regex_match: + raise ValueError( + "Could not parse table_id. Expected a table ID " + f"such as 'project.dataset.table', but got {table_id}." 
+ ) - return {"projectId": output_project_id, "datasetId": output_dataset_id, "tableId": output_table_id} + return { + "projectId": default_project, + "datasetId": regex_match.group("inner_parts"), + "tableId": regex_match.group("table"), + } diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py index 3ecc87068c0a..54d2888afb95 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py @@ -124,7 +124,9 @@ def path(self): routine = _get_routine_reference @classmethod - def from_api_repr(cls, resource: dict | _string_references.ParsedDatasetReference) -> "DatasetReference": + def from_api_repr( + cls, resource: dict | _string_references.ParsedDatasetReference + ) -> "DatasetReference": """Factory: construct a dataset reference given its API representation Args: @@ -171,7 +173,8 @@ def from_string( _string_references.parse_dataset_reference( dataset_id=dataset_id, default_project=default_project, - )) + ) + ) def to_api_repr(self) -> dict: """Construct the API resource representation of this dataset reference diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py index 2eb2fd890e54..e68b79ce2fc6 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py @@ -286,10 +286,13 @@ def from_string( _string_references.parse_table_reference( table_id=table_id, default_project=default_project, - )) + ) + ) @classmethod - def from_api_repr(cls, resource: dict | _string_references.ParsedTableReference) -> "TableReference": + def from_api_repr( + cls, resource: dict | _string_references.ParsedTableReference + ) -> "TableReference": """Factory: construct a table reference given its API representation Args: diff --git 
a/packages/google-cloud-bigquery/tests/unit/test_string_references.py b/packages/google-cloud-bigquery/tests/unit/test_string_references.py index 2d039b1c9e65..f7e1477bd8f1 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_string_references.py +++ b/packages/google-cloud-bigquery/tests/unit/test_string_references.py @@ -44,9 +44,27 @@ "string-project", "string_dataset", ), + ( + "my-biglake-project.biglake_catalog.namespace_a.namespace_b", + "ignored-default-project", + "my-biglake-project", + # BigLake tables are usable from the BigQuery metadata APIs by + # combining catalog and namespace into the datasetId field. See + # internal issue b/512823729. + "biglake_catalog.namespace_a.namespace_b", + ), + ( + "example.com:my-biglake-project.biglake_catalog.namespace_a.namespace_b", + "ignored-default-project", + # BigLake tables should be usable from legacy domain-scoped project IDs. + "example.com:my-biglake-project", + "biglake_catalog.namespace_a.namespace_b", + ), ), ) -def test_dataset_reference(value, default_project, expected_project_id, expected_dataset_id): +def test_dataset_reference( + value, default_project, expected_project_id, expected_dataset_id +): got = bigquery.DatasetReference.from_string(value, default_project=default_project) assert got.project == expected_project_id assert got.dataset_id == expected_dataset_id @@ -63,23 +81,30 @@ def test_dataset_reference(value, default_project, expected_project_id, expected "string-project:string_dataset", "dataset_id must be a fully-qualified dataset ID", ), - ( - "google.com.string-project.dataset_id", - "Too many parts in dataset_id.", - ), - ( - "google.com:string-project.dataset_id.table_id", - "Too many parts in dataset_id.", - ) ), ) -def test_dataset_reference_without_default_project_value_error(value, expected_error_message): +@pytest.mark.parametrize( + ("default_project",), + ( + (None,), + ("",), + ), +) +def test_dataset_reference_without_default_project_value_error( + value, 
expected_error_message, default_project +): with pytest.raises(ValueError, match=expected_error_message): - bigquery.DatasetReference.from_string(value, default_project=None) + bigquery.DatasetReference.from_string(value, default_project=default_project) @pytest.mark.parametrize( - ("value", "default_project", "expected_project_id", "expected_dataset_id", "expected_table_id"), + ( + "value", + "default_project", + "expected_project_id", + "expected_dataset_id", + "expected_table_id", + ), ( ( "string-project.string_dataset.string_table", @@ -109,9 +134,29 @@ def test_dataset_reference_without_default_project_value_error(value, expected_e "string_dataset", "string_table", ), + ( + "my-biglake-project.biglake_catalog.namespace_a.namespace_b.biglake_table", + "ignored-default-project", + "my-biglake-project", + # BigLake tables are usable from the BigQuery metadata APIs by + # combining catalog and namespace into the datasetId field. See + # internal issue b/512823729. + "biglake_catalog.namespace_a.namespace_b", + "biglake_table", + ), + ( + "example.com:my-biglake-project.biglake_catalog.namespace_a.namespace_b.biglake_table", + "ignored-default-project", + # BigLake tables should be usable from legacy domain-scoped project IDs. 
+ "example.com:my-biglake-project", + "biglake_catalog.namespace_a.namespace_b", + "biglake_table", + ), ), ) -def test_table_reference(value, default_project, expected_project_id, expected_dataset_id, expected_table_id): +def test_table_reference( + value, default_project, expected_project_id, expected_dataset_id, expected_table_id +): got = bigquery.TableReference.from_string(value, default_project=default_project) assert got.project == expected_project_id assert got.dataset_id == expected_dataset_id @@ -121,23 +166,18 @@ def test_table_reference(value, default_project, expected_project_id, expected_d @pytest.mark.parametrize( ("value",), ( - ( - "string_table", - ), - ( - "string_dataset.string_table", - ), - ( - "string-project:string_dataset.string_table", - ), - ( - "google.com.string-project.dataset_id", - ), - ( - "a.b.c.d", - ) + ("string_table",), + ("string_dataset.string_table",), + ("string-project:string_dataset.string_table",), + ), +) +@pytest.mark.parametrize( + ("default_project",), + ( + (None,), + ("",), ), ) -def test_table_reference_without_default_project_value_error(value): - with pytest.raises(ValueError, match="table_id must be a fully-qualified ID in standard SQL format"): - bigquery.TableReference.from_string(value, default_project=None) +def test_table_reference_without_default_project_value_error(value, default_project): + with pytest.raises(ValueError, match="Supply a default project"): + bigquery.TableReference.from_string(value, default_project=default_project) From 190aefa47144603b01c7884b38d707ee3a6d810a Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 19:11:41 +0000 Subject: [PATCH 3/6] add system tests --- .../tests/system/test_client.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/packages/google-cloud-bigquery/tests/system/test_client.py b/packages/google-cloud-bigquery/tests/system/test_client.py index 77cb6e9f02e8..b6da77c04bdb 100644 --- 
a/packages/google-cloud-bigquery/tests/system/test_client.py +++ b/packages/google-cloud-bigquery/tests/system/test_client.py @@ -304,6 +304,18 @@ def test_get_dataset(self): self.assertEqual(got.friendly_name, "Friendly") self.assertEqual(got.description, "Description") + def test_get_dataset_w_public_biglake(self): + dataset_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data" + + dataset = Config.CLIENT.get_dataset(dataset_id) + self.assertEqual( + dataset.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data" + ) + self.assertEqual(dataset.project, "bigquery-public-data") + self.assertGreater( + dataset.created, datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc) + ) + def test_create_dataset_with_default_rounding_mode(self): DATASET_ID = _make_dataset_id("create_dataset_rounding_mode") dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN") @@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self): with self.assertRaises(exceptions.BadRequest): Config.CLIENT.delete_dataset(dataset) + def test_get_table_w_public_biglake(self): + table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab" + + table = Config.CLIENT.get_table(table_id) + self.assertEqual(table.table_id, "nyc_taxicab") + self.assertEqual( + table.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data" + ) + self.assertEqual(table.project, "bigquery-public-data") + schema_names = [field.name for field in table.schema] + self.assertGreater(len(schema_names), 0) + def test_get_table_w_public_dataset(self): public = "bigquery-public-data" dataset_id = "samples" From 8b407b2bea546395b4eb47141689e885169b37ae Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 19:56:47 +0000 Subject: [PATCH 4/6] fix 3.9 lint --- .../google/cloud/bigquery/_string_references.py | 12 +++++------- .../google/cloud/bigquery/dataset.py | 2 +- .../google/cloud/bigquery/table.py | 2 +- 
.../google-cloud-bigquery/tests/unit/test_magics.py | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py index 1fc0b89a1706..16a145a05a96 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py @@ -19,12 +19,10 @@ from __future__ import annotations import re -import typing +from typing import TypedDict, Union -from google.cloud.bigquery import _helpers - -ParsedDatasetReference = typing.TypedDict( +ParsedDatasetReference = TypedDict( "ParsedDatasetReference", { "projectId": str, @@ -33,7 +31,7 @@ ) -ParsedTableReference = typing.TypedDict( +ParsedTableReference = TypedDict( "ParsedTableReference", { "projectId": str, @@ -86,7 +84,7 @@ def parse_dataset_reference( - dataset_id: str, *, default_project: str | None + dataset_id: str, *, default_project: Union[str, None] ) -> ParsedDatasetReference: """Parse a dataset ID string. @@ -122,7 +120,7 @@ def parse_dataset_reference( def parse_table_reference( - table_id: str, *, default_project: str | None + table_id: str, *, default_project: Union[str, None] ) -> ParsedTableReference: """Parse a table ID string. 
diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py index 54d2888afb95..fa3342de1628 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py @@ -125,7 +125,7 @@ def path(self): @classmethod def from_api_repr( - cls, resource: dict | _string_references.ParsedDatasetReference + cls, resource: Union[dict, _string_references.ParsedDatasetReference] ) -> "DatasetReference": """Factory: construct a dataset reference given its API representation diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py index e68b79ce2fc6..b58499343b8a 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py @@ -291,7 +291,7 @@ def from_string( @classmethod def from_api_repr( - cls, resource: dict | _string_references.ParsedTableReference + cls, resource: Union[dict, _string_references.ParsedTableReference] ) -> "TableReference": """Factory: construct a table reference given its API representation diff --git a/packages/google-cloud-bigquery/tests/unit/test_magics.py b/packages/google-cloud-bigquery/tests/unit/test_magics.py index 6b082c8194c7..dd8e5e25b772 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_magics.py +++ b/packages/google-cloud-bigquery/tests/unit/test_magics.py @@ -2045,7 +2045,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch): # considered a table name, thus we expect an error that the table ID is not valid. output = captured_io.stderr assert "ERROR:" in output - assert "must be a fully-qualified ID" in output + assert "Could not parse table_id." 
in output @pytest.mark.usefixtures("ipython_interactive") From 85b500abbbedd62ecdc9d31d0d562f7002138b5c Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 20:55:58 +0000 Subject: [PATCH 5/6] magics tests fixes --- .../tests/unit/test_magics.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/google-cloud-bigquery/tests/unit/test_magics.py b/packages/google-cloud-bigquery/tests/unit/test_magics.py index dd8e5e25b772..995da59699a5 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_magics.py +++ b/packages/google-cloud-bigquery/tests/unit/test_magics.py @@ -1337,12 +1337,12 @@ def test_context_with_no_query_cache_from_context(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) bigquery.load_ipython_extension(ip) + context = magics.Context() conn = make_connection() - monkeypatch.setattr(magics.context, "_connection", conn) - monkeypatch.setattr(magics.context, "project", "project-from-context") - monkeypatch.setattr( - magics.context.default_query_job_config, "use_query_cache", False - ) + context._connection = conn + context.project = "project-from-context" + context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False) + monkeypatch.setattr(magics, "context", context) ip.run_cell_magic("bigquery", "", QUERY_STRING) @@ -1415,12 +1415,16 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) bigquery.load_ipython_extension(ip) - magics.context.progress_bar_type = None + context = magics.Context() + conn = make_connection() + context._connection = conn + context.progress_bar_type = None + context.project = "unit-test-project" + monkeypatch.setattr(magics, "context", context) run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) - magics.context.project = "unit-test-project" with run_query_patch as 
run_query_mock: ip.run_cell_magic( From a8b73b24e3d099c9f7a49ca091b5eb41cab80906 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Thu, 14 May 2026 21:01:21 +0000 Subject: [PATCH 6/6] mock credentials --- packages/google-cloud-bigquery/tests/unit/test_magics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/google-cloud-bigquery/tests/unit/test_magics.py b/packages/google-cloud-bigquery/tests/unit/test_magics.py index 995da59699a5..8eaf944041ac 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_magics.py +++ b/packages/google-cloud-bigquery/tests/unit/test_magics.py @@ -1340,8 +1340,9 @@ def test_context_with_no_query_cache_from_context(monkeypatch): context = magics.Context() conn = make_connection() context._connection = conn - context.project = "project-from-context" + context.credentials = mock.create_autospec(google.auth.credentials.Credentials) context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False) + context.project = "project-from-context" monkeypatch.setattr(magics, "context", context) ip.run_cell_magic("bigquery", "", QUERY_STRING) @@ -1418,6 +1419,7 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch): context = magics.Context() conn = make_connection() context._connection = conn + context.credentials = mock.create_autospec(google.auth.credentials.Credentials) context.progress_bar_type = None context.project = "unit-test-project" monkeypatch.setattr(magics, "context", context)