diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py new file mode 100644 index 000000000000..16a145a05a96 --- /dev/null +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py @@ -0,0 +1,166 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper to turn string references into REST resources.""" + +# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes. + +from __future__ import annotations + +import re +from typing import TypedDict, Union + + +ParsedDatasetReference = TypedDict( + "ParsedDatasetReference", + { + "projectId": str, + "datasetId": str, + }, +) + + +ParsedTableReference = TypedDict( + "ParsedTableReference", + { + "projectId": str, + "datasetId": str, + "tableId": str, + }, +) + + +_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile( + # In the past, organizations could prefix their project IDs with a domain + # name. Such projects still exist, especially at Google. + r"^(?P[^:]+:)?" + r"(?P[^.]+)\." + # Match dataset or catalog + namespace. + # + # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support + # this without catastrophic backtracking by moving the trailing "." to the + # table group. 
+    r"(?P<inner_parts>.*)"
+)
+
+
+_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile(
+    # In the past, organizations could prefix their project IDs with a domain
+    # name. Such projects still exist, especially at Google.
+    r"^(?P<legacy_project_domain>[^:]+:)?"
+    r"(?P<project>[^.]+)\."
+    # Match dataset or catalog + namespace.
+    #
+    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
+    # this without catastrophic backtracking by moving the trailing "." to the
+    # table group.
+    r"(?P<inner_parts>.*)"
+    # Table names can't contain ".", as that's used as the separator.
+    r"\.(?P<table>[^.]+)$"
+)
+
+
+_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile(
+    # Match dataset or catalog + namespace.
+    #
+    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
+    # this without catastrophic backtracking by moving the trailing "." to the
+    # table group.
+    r"(?P<inner_parts>.*)"
+    # Table names can't contain ".", as that's used as the separator.
+    r"\.(?P<table>[^.]+)$"
+)
Supply a default project or a fully-qualified table ID, " + f"such as 'project.dataset.table'. Got {table_id}." + ) + + regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id) + if not regex_match: + raise ValueError( + "Could not parse table_id. Expected a table ID" + f"such as 'project.dataset.table', but got {table_id}." + ) + + return { + "projectId": default_project, + "datasetId": regex_match.group("inner_parts"), + "tableId": regex_match.group("table"), + } diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py index 878b77d4186c..fa3342de1628 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py @@ -30,6 +30,7 @@ from google.cloud.bigquery.table import Table, TableReference from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration from google.cloud.bigquery import external_config +from google.cloud.bigquery import _string_references def _get_table_reference(self, table_id: str) -> TableReference: @@ -123,7 +124,9 @@ def path(self): routine = _get_routine_reference @classmethod - def from_api_repr(cls, resource: dict) -> "DatasetReference": + def from_api_repr( + cls, resource: Union[dict, _string_references.ParsedDatasetReference] + ) -> "DatasetReference": """Factory: construct a dataset reference given its API representation Args: @@ -166,28 +169,12 @@ def from_string( If ``dataset_id`` is not a fully-qualified dataset ID in standard SQL format. 
""" - output_dataset_id = dataset_id - parts = _helpers._split_id(dataset_id) - - if len(parts) == 1: - if default_project is not None: - output_project_id = default_project - else: - raise ValueError( - "When default_project is not set, dataset_id must be a " - "fully-qualified dataset ID in standard SQL format, " - 'e.g., "project.dataset_id" got {}'.format(dataset_id) - ) - elif len(parts) == 2: - output_project_id, output_dataset_id = parts - else: - raise ValueError( - "Too many parts in dataset_id. Expected a fully-qualified " - "dataset ID in standard SQL format, " - 'e.g. "project.dataset_id", got {}'.format(dataset_id) + return cls.from_api_repr( + _string_references.parse_dataset_reference( + dataset_id=dataset_id, + default_project=default_project, ) - - return cls(output_project_id, output_dataset_id) + ) def to_api_repr(self) -> dict: """Construct the API resource representation of this dataset reference diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py index 8ba877026d9c..b58499343b8a 100644 --- a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py +++ b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py @@ -72,6 +72,7 @@ from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields from google.cloud.bigquery import external_config +from google.cloud.bigquery import _string_references if typing.TYPE_CHECKING: # pragma: NO COVER # Unconditionally import optional dependencies again to tell pytype that @@ -281,22 +282,17 @@ def from_string( If ``table_id`` is not a fully-qualified table ID in standard SQL format. 
""" - from google.cloud.bigquery.dataset import DatasetReference - - ( - output_project_id, - output_dataset_id, - output_table_id, - ) = _helpers._parse_3_part_id( - table_id, default_project=default_project, property_name="table_id" - ) - - return cls( - DatasetReference(output_project_id, output_dataset_id), output_table_id + return cls.from_api_repr( + _string_references.parse_table_reference( + table_id=table_id, + default_project=default_project, + ) ) @classmethod - def from_api_repr(cls, resource: dict) -> "TableReference": + def from_api_repr( + cls, resource: Union[dict, _string_references.ParsedTableReference] + ) -> "TableReference": """Factory: construct a table reference given its API representation Args: diff --git a/packages/google-cloud-bigquery/tests/system/test_client.py b/packages/google-cloud-bigquery/tests/system/test_client.py index 77cb6e9f02e8..b6da77c04bdb 100644 --- a/packages/google-cloud-bigquery/tests/system/test_client.py +++ b/packages/google-cloud-bigquery/tests/system/test_client.py @@ -304,6 +304,18 @@ def test_get_dataset(self): self.assertEqual(got.friendly_name, "Friendly") self.assertEqual(got.description, "Description") + def test_get_dataset_w_public_biglake(self): + dataset_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data" + + dataset = Config.CLIENT.get_dataset(dataset_id) + self.assertEqual( + dataset.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data" + ) + self.assertEqual(dataset.project, "bigquery-public-data") + self.assertGreater( + dataset.created, datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc) + ) + def test_create_dataset_with_default_rounding_mode(self): DATASET_ID = _make_dataset_id("create_dataset_rounding_mode") dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN") @@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self): with self.assertRaises(exceptions.BadRequest): Config.CLIENT.delete_dataset(dataset) + def 
test_get_table_w_public_biglake(self): + table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab" + + table = Config.CLIENT.get_table(table_id) + self.assertEqual(table.table_id, "nyc_taxicab") + self.assertEqual( + table.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data" + ) + self.assertEqual(table.project, "bigquery-public-data") + schema_names = [field.name for field in table.schema] + self.assertGreater(len(schema_names), 0) + def test_get_table_w_public_dataset(self): public = "bigquery-public-data" dataset_id = "samples" diff --git a/packages/google-cloud-bigquery/tests/unit/test_dataset.py b/packages/google-cloud-bigquery/tests/unit/test_dataset.py index 604e5ed2e4bf..98466544b5ae 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_dataset.py +++ b/packages/google-cloud-bigquery/tests/unit/test_dataset.py @@ -820,54 +820,6 @@ def test_from_api_repr(self): self.assertEqual(expected, got) - def test_from_string(self): - cls = self._get_target_class() - got = cls.from_string("string-project.string_dataset") - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_w_prefix(self): - cls = self._get_target_class() - got = cls.from_string("google.com:string-project.string_dataset") - self.assertEqual(got.project, "google.com:string-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_legacy_string(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string-project:string_dataset") - - def test_from_string_w_incorrect_prefix(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com.string-project.dataset_id") - - def test_from_string_w_prefix_and_too_many_parts(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com:string-project.dataset_id.table_id") - - def 
test_from_string_not_fully_qualified(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string_dataset") - with self.assertRaises(ValueError): - cls.from_string("a.b.c") - - def test_from_string_with_default_project(self): - cls = self._get_target_class() - got = cls.from_string("string_dataset", default_project="default-project") - self.assertEqual(got.project, "default-project") - self.assertEqual(got.dataset_id, "string_dataset") - - def test_from_string_ignores_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string-project.string_dataset", default_project="default-project" - ) - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - def test___eq___wrong_type(self): dataset = self._make_one("project_1", "dataset_1") other = object() diff --git a/packages/google-cloud-bigquery/tests/unit/test_magics.py b/packages/google-cloud-bigquery/tests/unit/test_magics.py index 6b082c8194c7..8eaf944041ac 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_magics.py +++ b/packages/google-cloud-bigquery/tests/unit/test_magics.py @@ -1337,12 +1337,13 @@ def test_context_with_no_query_cache_from_context(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) bigquery.load_ipython_extension(ip) + context = magics.Context() conn = make_connection() - monkeypatch.setattr(magics.context, "_connection", conn) - monkeypatch.setattr(magics.context, "project", "project-from-context") - monkeypatch.setattr( - magics.context.default_query_job_config, "use_query_cache", False - ) + context._connection = conn + context.credentials = mock.create_autospec(google.auth.credentials.Credentials) + context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False) + context.project = "project-from-context" + monkeypatch.setattr(magics, "context", context) ip.run_cell_magic("bigquery", "", QUERY_STRING) @@ 
-1415,12 +1416,17 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) bigquery.load_ipython_extension(ip) - magics.context.progress_bar_type = None + context = magics.Context() + conn = make_connection() + context._connection = conn + context.credentials = mock.create_autospec(google.auth.credentials.Credentials) + context.progress_bar_type = None + context.project = "unit-test-project" + monkeypatch.setattr(magics, "context", context) run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) - magics.context.project = "unit-test-project" with run_query_patch as run_query_mock: ip.run_cell_magic( @@ -2045,7 +2051,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch): # considered a table name, thus we expect an error that the table ID is not valid. output = captured_io.stderr assert "ERROR:" in output - assert "must be a fully-qualified ID" in output + assert "Could not parse table_id." in output @pytest.mark.usefixtures("ipython_interactive") diff --git a/packages/google-cloud-bigquery/tests/unit/test_string_references.py b/packages/google-cloud-bigquery/tests/unit/test_string_references.py new file mode 100644 index 000000000000..f7e1477bd8f1 --- /dev/null +++ b/packages/google-cloud-bigquery/tests/unit/test_string_references.py @@ -0,0 +1,183 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from google.cloud import bigquery + + +@pytest.mark.parametrize( + ("value", "default_project", "expected_project_id", "expected_dataset_id"), + ( + ( + "string-project.string_dataset", + None, + "string-project", + "string_dataset", + ), + ( + "google.com:string-project.string_dataset", + None, + "google.com:string-project", + "string_dataset", + ), + ( + "string_dataset", + "default-project", + "default-project", + "string_dataset", + ), + ( + "string-project.string_dataset", + "default-project", + "string-project", + "string_dataset", + ), + ( + "my-biglake-project.biglake_catalog.namespace_a.namespace_b", + "ignored-default-project", + "my-biglake-project", + # BigLake tables are usable from the BigQuery metadata APIs by + # combining catalog and namespace into the datasetId field. See + # internal issue b/512823729. + "biglake_catalog.namespace_a.namespace_b", + ), + ( + "example.com:my-biglake-project.biglake_catalog.namespace_a.namespace_b", + "ignored-default-project", + # BigLake tables should be usable from legacy domain-scoped project IDs. 
+ "example.com:my-biglake-project", + "biglake_catalog.namespace_a.namespace_b", + ), + ), +) +def test_dataset_reference( + value, default_project, expected_project_id, expected_dataset_id +): + got = bigquery.DatasetReference.from_string(value, default_project=default_project) + assert got.project == expected_project_id + assert got.dataset_id == expected_dataset_id + + +@pytest.mark.parametrize( + ("value", "expected_error_message"), + ( + ( + "string_dataset", + "dataset_id must be a fully-qualified dataset ID", + ), + ( + "string-project:string_dataset", + "dataset_id must be a fully-qualified dataset ID", + ), + ), +) +@pytest.mark.parametrize( + ("default_project",), + ( + (None,), + ("",), + ), +) +def test_dataset_reference_without_default_project_value_error( + value, expected_error_message, default_project +): + with pytest.raises(ValueError, match=expected_error_message): + bigquery.DatasetReference.from_string(value, default_project=default_project) + + +@pytest.mark.parametrize( + ( + "value", + "default_project", + "expected_project_id", + "expected_dataset_id", + "expected_table_id", + ), + ( + ( + "string-project.string_dataset.string_table", + None, + "string-project", + "string_dataset", + "string_table", + ), + ( + "google.com:string-project.string_dataset.string_table", + None, + "google.com:string-project", + "string_dataset", + "string_table", + ), + ( + "string_dataset.string_table", + "default-project", + "default-project", + "string_dataset", + "string_table", + ), + ( + "my-project.string_dataset.string_table", + "ignored-default-project", + "my-project", + "string_dataset", + "string_table", + ), + ( + "my-biglake-project.biglake_catalog.namespace_a.namespace_b.biglake_table", + "ignored-default-project", + "my-biglake-project", + # BigLake tables are usable from the BigQuery metadata APIs by + # combining catalog and namespace into the datasetId field. See + # internal issue b/512823729. 
+ "biglake_catalog.namespace_a.namespace_b", + "biglake_table", + ), + ( + "example.com:my-biglake-project.biglake_catalog.namespace_a.namespace_b.biglake_table", + "ignored-default-project", + # BigLake tables should be usable from legacy domain-scoped project IDs. + "example.com:my-biglake-project", + "biglake_catalog.namespace_a.namespace_b", + "biglake_table", + ), + ), +) +def test_table_reference( + value, default_project, expected_project_id, expected_dataset_id, expected_table_id +): + got = bigquery.TableReference.from_string(value, default_project=default_project) + assert got.project == expected_project_id + assert got.dataset_id == expected_dataset_id + assert got.table_id == expected_table_id + + +@pytest.mark.parametrize( + ("value",), + ( + ("string_table",), + ("string_dataset.string_table",), + ("string-project:string_dataset.string_table",), + ), +) +@pytest.mark.parametrize( + ("default_project",), + ( + (None,), + ("",), + ), +) +def test_table_reference_without_default_project_value_error(value, default_project): + with pytest.raises(ValueError, match="Supply a default project"): + bigquery.TableReference.from_string(value, default_project=default_project) diff --git a/packages/google-cloud-bigquery/tests/unit/test_table.py b/packages/google-cloud-bigquery/tests/unit/test_table.py index 32de1ac3497c..0297156aef95 100644 --- a/packages/google-cloud-bigquery/tests/unit/test_table.py +++ b/packages/google-cloud-bigquery/tests/unit/test_table.py @@ -302,60 +302,6 @@ def test_from_api_repr(self): self.assertEqual(expected, got) - def test_from_string(self): - cls = self._get_target_class() - got = cls.from_string("string-project.string_dataset.string_table") - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def test_from_string_w_prefix(self): - cls = self._get_target_class() - got = 
cls.from_string("google.com:string-project.string_dataset.string_table") - self.assertEqual(got.project, "google.com:string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def test_from_string_legacy_string(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string-project:string_dataset.string_table") - - def test_from_string_w_incorrect_prefix(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("google.com.string-project.string_dataset.string_table") - - def test_from_string_not_fully_qualified(self): - cls = self._get_target_class() - with self.assertRaises(ValueError): - cls.from_string("string_table") - - with self.assertRaises(ValueError): - cls.from_string("string_dataset.string_table") - - with self.assertRaises(ValueError): - cls.from_string("a.b.c.d") - - def test_from_string_with_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string_dataset.string_table", default_project="default-project" - ) - self.assertEqual(got.project, "default-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - - def test_from_string_ignores_default_project(self): - cls = self._get_target_class() - got = cls.from_string( - "string-project.string_dataset.string_table", - default_project="default-project", - ) - self.assertEqual(got.project, "string-project") - self.assertEqual(got.dataset_id, "string_dataset") - self.assertEqual(got.table_id, "string_table") - def test___repr__(self): dataset = DatasetReference("project1", "dataset1") table1 = self._make_one(dataset, "table1")