Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper to turn string references into REST resources."""

# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes.

from __future__ import annotations

import re
from typing import TypedDict, Union


class ParsedDatasetReference(TypedDict):
    """REST resource representation of a dataset reference.

    Keys use the camelCase names expected by the BigQuery REST API.
    """

    projectId: str
    datasetId: str


class ParsedTableReference(TypedDict):
    """REST resource representation of a table reference.

    Keys use the camelCase names expected by the BigQuery REST API.
    """

    projectId: str
    datasetId: str
    tableId: str


# Matches "project.dataset" style references, where the dataset part may
# itself contain "." separators (Iceberg/BigLake catalog + nested namespace).
# Used with .match(), so it is anchored at the start of the string.
_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile(
    # In the past, organizations could prefix their project IDs with a domain
    # name. Such projects still exist, especially at Google.
    r"^(?P<legacy_project_domain>[^:]+:)?"
    r"(?P<project>[^.]+)\."
    # Match dataset or catalog + namespace.
    #
    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake, so
    # everything after the first "." (the project separator) is kept as the
    # dataset ID, including any further "." separators.
    r"(?P<inner_parts>.*)"
)


# Matches "project.dataset.table" style references, where the dataset part may
# itself contain "." separators (Iceberg/BigLake catalog + nested namespace).
# Used with .match(), so it is anchored at the start of the string.
_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile(
    # In the past, organizations could prefix their project IDs with a domain
    # name. Such projects still exist, especially at Google.
    r"^(?P<legacy_project_domain>[^:]+:)?"
    r"(?P<project>[^.]+)\."
    # Match dataset or catalog + namespace.
    #
    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
    # this without catastrophic backtracking by moving the trailing "." to the
    # table group.
    r"(?P<inner_parts>.*)"
    # Table names can't contain ".", as that's used as the separator. The
    # greedy inner_parts group backtracks just enough to leave the final
    # ".table" segment for this group.
    r"\.(?P<table>[^.]+)$"
)


# Matches "dataset.table" style references (no project part), where the
# dataset part may itself contain "." separators. Used with .match() after the
# fully-qualified pattern has already failed and a default project is known.
_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile(
    # Match dataset or catalog + namespace.
    #
    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
    # this without catastrophic backtracking by moving the trailing "." to the
    # table group.
    r"(?P<inner_parts>.*)"
    # Table names can't contain ".", as that's used as the separator.
    r"\.(?P<table>[^.]+)$"
)


def parse_dataset_reference(
    dataset_id: str, *, default_project: Union[str, None]
) -> ParsedDatasetReference:
    """Parse a dataset ID string.

    Args:
        dataset_id:
            A dataset ID, either fully-qualified in standard SQL format
            ("project.dataset_id", optionally with a legacy "domain:" project
            prefix and a nested Iceberg/BigLake namespace in the dataset
            part) or relative ("dataset_id").
        default_project:
            Project ID used when ``dataset_id`` is not fully-qualified.

    Returns:
        ParsedDatasetReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified dataset ID can't be determined.
    """
    regex_match = _FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN.match(dataset_id)
    if regex_match:
        legacy_project_domain = regex_match.group("legacy_project_domain")
        project = regex_match.group("project")

        # Legacy "domain:project" IDs keep the domain (including the trailing
        # ":") as part of the project ID.
        if legacy_project_domain:
            output_project_id = f"{legacy_project_domain}{project}"
        else:
            output_project_id = project

        return {
            "projectId": output_project_id,
            "datasetId": regex_match.group("inner_parts"),
        }

    if not default_project:
        # Use an f-string for consistency with parse_table_reference; the
        # rendered message is unchanged.
        raise ValueError(
            "When default_project is not set, dataset_id must be a "
            "fully-qualified dataset ID in standard SQL format, "
            f'e.g., "project.dataset_id" got {dataset_id}'
        )

    return {"datasetId": dataset_id, "projectId": default_project}


def parse_table_reference(
    table_id: str, *, default_project: Union[str, None]
) -> ParsedTableReference:
    """Parse a table ID string.

    Args:
        table_id:
            A table ID, either fully-qualified in standard SQL format
            ("project.dataset.table", optionally with a legacy "domain:"
            project prefix and a nested Iceberg/BigLake namespace in the
            dataset part) or relative ("dataset.table").
        default_project:
            Project ID used when ``table_id`` is not fully-qualified.

    Returns:
        ParsedTableReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified table ID can't be determined.
    """
    regex_match = _FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN.match(table_id)
    if regex_match:
        legacy_project_domain = regex_match.group("legacy_project_domain")
        project = regex_match.group("project")

        # Legacy "domain:project" IDs keep the domain (including the trailing
        # ":") as part of the project ID.
        if legacy_project_domain:
            output_project_id = f"{legacy_project_domain}{project}"
        else:
            output_project_id = project

        return {
            "projectId": output_project_id,
            "datasetId": regex_match.group("inner_parts"),
            "tableId": regex_match.group("table"),
        }

    if not default_project:
        raise ValueError(
            "Could not determine project ID. Supply a default project or a fully-qualified table ID, "
            f"such as 'project.dataset.table'. Got {table_id}."
        )

    regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id)
    if not regex_match:
        # Fix: the original implicit string concatenation was missing a space
        # and rendered as "Expected a table IDsuch as ...".
        raise ValueError(
            "Could not parse table_id. Expected a table ID "
            f"such as 'project.dataset.table', but got {table_id}."
        )

    return {
        "projectId": default_project,
        "datasetId": regex_match.group("inner_parts"),
        "tableId": regex_match.group("table"),
    }
31 changes: 9 additions & 22 deletions packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from google.cloud.bigquery.table import Table, TableReference
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery import external_config
from google.cloud.bigquery import _string_references


def _get_table_reference(self, table_id: str) -> TableReference:
Expand Down Expand Up @@ -123,7 +124,9 @@ def path(self):
routine = _get_routine_reference

@classmethod
def from_api_repr(cls, resource: dict) -> "DatasetReference":
def from_api_repr(
cls, resource: Union[dict, _string_references.ParsedDatasetReference]
) -> "DatasetReference":
"""Factory: construct a dataset reference given its API representation

Args:
Expand Down Expand Up @@ -166,28 +169,12 @@ def from_string(
If ``dataset_id`` is not a fully-qualified dataset ID in
standard SQL format.
"""
output_dataset_id = dataset_id
parts = _helpers._split_id(dataset_id)

if len(parts) == 1:
if default_project is not None:
output_project_id = default_project
else:
raise ValueError(
"When default_project is not set, dataset_id must be a "
"fully-qualified dataset ID in standard SQL format, "
'e.g., "project.dataset_id" got {}'.format(dataset_id)
)
elif len(parts) == 2:
output_project_id, output_dataset_id = parts
else:
raise ValueError(
"Too many parts in dataset_id. Expected a fully-qualified "
"dataset ID in standard SQL format, "
'e.g. "project.dataset_id", got {}'.format(dataset_id)
return cls.from_api_repr(
_string_references.parse_dataset_reference(
dataset_id=dataset_id,
default_project=default_project,
)

return cls(output_project_id, output_dataset_id)
)

def to_api_repr(self) -> dict:
"""Construct the API resource representation of this dataset reference
Expand Down
22 changes: 9 additions & 13 deletions packages/google-cloud-bigquery/google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
from google.cloud.bigquery.schema import _parse_schema_resource
from google.cloud.bigquery.schema import _to_schema_fields
from google.cloud.bigquery import external_config
from google.cloud.bigquery import _string_references

if typing.TYPE_CHECKING: # pragma: NO COVER
# Unconditionally import optional dependencies again to tell pytype that
Expand Down Expand Up @@ -281,22 +282,17 @@ def from_string(
If ``table_id`` is not a fully-qualified table ID in
standard SQL format.
"""
from google.cloud.bigquery.dataset import DatasetReference

(
output_project_id,
output_dataset_id,
output_table_id,
) = _helpers._parse_3_part_id(
table_id, default_project=default_project, property_name="table_id"
)

return cls(
DatasetReference(output_project_id, output_dataset_id), output_table_id
return cls.from_api_repr(
_string_references.parse_table_reference(
table_id=table_id,
default_project=default_project,
)
)

@classmethod
def from_api_repr(cls, resource: dict) -> "TableReference":
def from_api_repr(
cls, resource: Union[dict, _string_references.ParsedTableReference]
) -> "TableReference":
"""Factory: construct a table reference given its API representation

Args:
Expand Down
24 changes: 24 additions & 0 deletions packages/google-cloud-bigquery/tests/system/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,18 @@ def test_get_dataset(self):
self.assertEqual(got.friendly_name, "Friendly")
self.assertEqual(got.description, "Description")

def test_get_dataset_w_public_biglake(self):
dataset_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data"

dataset = Config.CLIENT.get_dataset(dataset_id)
self.assertEqual(
dataset.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
)
self.assertEqual(dataset.project, "bigquery-public-data")
self.assertGreater(
dataset.created, datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)
)

def test_create_dataset_with_default_rounding_mode(self):
DATASET_ID = _make_dataset_id("create_dataset_rounding_mode")
dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN")
Expand Down Expand Up @@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self):
with self.assertRaises(exceptions.BadRequest):
Config.CLIENT.delete_dataset(dataset)

def test_get_table_w_public_biglake(self):
table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab"

table = Config.CLIENT.get_table(table_id)
self.assertEqual(table.table_id, "nyc_taxicab")
self.assertEqual(
table.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
)
self.assertEqual(table.project, "bigquery-public-data")
schema_names = [field.name for field in table.schema]
self.assertGreater(len(schema_names), 0)

def test_get_table_w_public_dataset(self):
public = "bigquery-public-data"
dataset_id = "samples"
Expand Down
48 changes: 0 additions & 48 deletions packages/google-cloud-bigquery/tests/unit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,54 +820,6 @@ def test_from_api_repr(self):

self.assertEqual(expected, got)

def test_from_string(self):
cls = self._get_target_class()
got = cls.from_string("string-project.string_dataset")
self.assertEqual(got.project, "string-project")
self.assertEqual(got.dataset_id, "string_dataset")

def test_from_string_w_prefix(self):
cls = self._get_target_class()
got = cls.from_string("google.com:string-project.string_dataset")
self.assertEqual(got.project, "google.com:string-project")
self.assertEqual(got.dataset_id, "string_dataset")

def test_from_string_legacy_string(self):
cls = self._get_target_class()
with self.assertRaises(ValueError):
cls.from_string("string-project:string_dataset")

def test_from_string_w_incorrect_prefix(self):
cls = self._get_target_class()
with self.assertRaises(ValueError):
cls.from_string("google.com.string-project.dataset_id")

def test_from_string_w_prefix_and_too_many_parts(self):
cls = self._get_target_class()
with self.assertRaises(ValueError):
cls.from_string("google.com:string-project.dataset_id.table_id")

def test_from_string_not_fully_qualified(self):
cls = self._get_target_class()
with self.assertRaises(ValueError):
cls.from_string("string_dataset")
with self.assertRaises(ValueError):
cls.from_string("a.b.c")

def test_from_string_with_default_project(self):
cls = self._get_target_class()
got = cls.from_string("string_dataset", default_project="default-project")
self.assertEqual(got.project, "default-project")
self.assertEqual(got.dataset_id, "string_dataset")

def test_from_string_ignores_default_project(self):
cls = self._get_target_class()
got = cls.from_string(
"string-project.string_dataset", default_project="default-project"
)
self.assertEqual(got.project, "string-project")
self.assertEqual(got.dataset_id, "string_dataset")

def test___eq___wrong_type(self):
dataset = self._make_one("project_1", "dataset_1")
other = object()
Expand Down
22 changes: 14 additions & 8 deletions packages/google-cloud-bigquery/tests/unit/test_magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,12 +1337,13 @@ def test_context_with_no_query_cache_from_context(monkeypatch):
ip = IPython.get_ipython()
monkeypatch.setattr(bigquery, "bigquery_magics", None)
bigquery.load_ipython_extension(ip)
context = magics.Context()
conn = make_connection()
monkeypatch.setattr(magics.context, "_connection", conn)
monkeypatch.setattr(magics.context, "project", "project-from-context")
monkeypatch.setattr(
magics.context.default_query_job_config, "use_query_cache", False
)
context._connection = conn
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False)
context.project = "project-from-context"
monkeypatch.setattr(magics, "context", context)

ip.run_cell_magic("bigquery", "", QUERY_STRING)

Expand Down Expand Up @@ -1415,12 +1416,17 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch):
ip = IPython.get_ipython()
monkeypatch.setattr(bigquery, "bigquery_magics", None)
bigquery.load_ipython_extension(ip)
magics.context.progress_bar_type = None
context = magics.Context()
conn = make_connection()
context._connection = conn
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
context.progress_bar_type = None
context.project = "unit-test-project"
monkeypatch.setattr(magics, "context", context)

run_query_patch = mock.patch(
"google.cloud.bigquery.magics.magics._run_query", autospec=True
)
magics.context.project = "unit-test-project"

with run_query_patch as run_query_mock:
ip.run_cell_magic(
Expand Down Expand Up @@ -2045,7 +2051,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch):
# considered a table name, thus we expect an error that the table ID is not valid.
output = captured_io.stderr
assert "ERROR:" in output
assert "must be a fully-qualified ID" in output
assert "Could not parse table_id." in output


@pytest.mark.usefixtures("ipython_interactive")
Expand Down
Loading
Loading