Feat: Add classes and objects to enable PANGEA support #2012

Draft
wants to merge 33 commits into base: main from add-pangea-classes
Changes from 2 commits (of 33)
Commits
1b234c8
Adds ExternalCatalogDatasetOptions to Dataset
chalmerlowe Sep 4, 2024
8029213
adds ExternalCatalogTableOptions class and assorted content
chalmerlowe Sep 5, 2024
0992bbf
modifies argument names to snake_case
chalmerlowe Sep 11, 2024
45ddd89
replaces dtype placeholder with parameter names
chalmerlowe Sep 11, 2024
1411460
updates the inclusion of value in properties to use repr version
chalmerlowe Sep 11, 2024
20ee950
updates another inclusion of value in properties to use repr version
chalmerlowe Sep 11, 2024
bee33ef
updates type check via isinstance() or None
chalmerlowe Sep 11, 2024
15acfb3
Merge branch 'main' into add-pangea-classes
chalmerlowe Sep 11, 2024
ee69f24
adds tests related to ExternalCatalogDatasetOptions
chalmerlowe Sep 12, 2024
aeab931
Merge branch 'main' into add-pangea-classes
chalmerlowe Sep 12, 2024
f9d657b
adds test suite for ExternalCatalogTableOptions and minor tweaks else…
chalmerlowe Sep 12, 2024
89896a3
corrects Error type of failing test
chalmerlowe Sep 19, 2024
c452459
forgive me... a wild mess of tests, tweaks, etc
chalmerlowe Sep 26, 2024
199e903
Updates isinstance_or_raise, refines ExternalCatalogDatasetOptions in…
chalmerlowe Oct 2, 2024
e238ba0
Updates ExternalCatalogTableOptions and associated tests
chalmerlowe Oct 2, 2024
5fc89ae
Tweaks several docstrings
chalmerlowe Oct 2, 2024
68d04f0
Adds content related to ForeignTypeInfo
chalmerlowe Oct 2, 2024
2a5774e
add new classes and tests
chalmerlowe Oct 3, 2024
cbd08c5
Merge branch 'main' into add-pangea-classes
chalmerlowe Oct 3, 2024
0fcf424
Update tests/unit/test_schema.py
chalmerlowe Oct 3, 2024
d7698d2
Update google/cloud/bigquery/_helpers.py
chalmerlowe Oct 11, 2024
43dc45e
updates logic and tests related to _isinstance_or_raise'
chalmerlowe Oct 11, 2024
4f117a7
updates from_api_repr and a number of tests and cleans up miscellaneo…
chalmerlowe Oct 11, 2024
defa38c
Update google/cloud/bigquery/_helpers.py
chalmerlowe Oct 14, 2024
14d1bd8
Most recent round of tweaks and experiments
chalmerlowe Oct 30, 2024
1b7ba09
Updates from futures import annotation.
chalmerlowe Nov 1, 2024
79bbeb2
Updates from_api_repr() and external_config tests
chalmerlowe Nov 4, 2024
d71d904
Updates external_catalog_dataset functions in dataset.py and tests.
chalmerlowe Nov 4, 2024
b0a7fb1
Adds fixtures, tests, corrections to classes and tests
chalmerlowe Nov 6, 2024
d0d96fa
Updates comments and addes a to_api_repr test
chalmerlowe Nov 6, 2024
16e2c2c
Merge branch 'main' into add-pangea-classes
chalmerlowe Nov 12, 2024
116de78
Revises test for additional clarity
chalmerlowe Nov 13, 2024
d1432f9
Merge branch 'main' into add-pangea-classes
chalmerlowe Nov 13, 2024
21 changes: 21 additions & 0 deletions google/cloud/bigquery/dataset.py
@@ -27,6 +27,8 @@
from google.cloud.bigquery.routine import Routine, RoutineReference
from google.cloud.bigquery.table import Table, TableReference
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions


from typing import Optional, List, Dict, Any, Union

@@ -530,6 +532,7 @@ class Dataset(object):
"storage_billing_model": "storageBillingModel",
"max_time_travel_hours": "maxTimeTravelHours",
"default_rounding_mode": "defaultRoundingMode",
"external_catalog_dataset_options": "externalCatalogDatasetOptions",
}

def __init__(self, dataset_ref) -> None:
@@ -937,6 +940,24 @@ def _build_resource(self, filter_fields):
"""Generate a resource for ``update``."""
return _helpers._build_resource_from_properties(self, filter_fields)

@property
def external_catalog_dataset_options(self):
"""Options defining open source compatible datasets living in the
BigQuery catalog. Contains metadata of open source database, schema
or namespace represented by the current dataset."""

return self._properties.get("externalCatalogDatasetOptions")

@external_catalog_dataset_options.setter
def external_catalog_dataset_options(self, value):
if not isinstance(value, ExternalCatalogDatasetOptions) and value is not None:
raise ValueError(
"external_catalog_dataset_options must be an "
"ExternalCatalogDatasetOptions object or None. "
f"Got {repr(value)}."
)
self._properties["externalCatalogDatasetOptions"] = value

table = _get_table_reference

model = _get_model_reference
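
For orientation, here is a minimal usage sketch of the new Dataset property as it stands in this two-commit view. The project, dataset ID, storage URI, and parameter values below are made up, and note that the setter currently stores the options object itself in `_properties`, so serializing it through `update_dataset` likely depends on later commits in this PR.

```python
from google.cloud import bigquery
from google.cloud.bigquery.external_config import ExternalCatalogDatasetOptions

client = bigquery.Client()
dataset = client.get_dataset("my-project.my_dataset")  # hypothetical dataset ID

# The setter accepts only ExternalCatalogDatasetOptions or None and raises
# ValueError for anything else.
dataset.external_catalog_dataset_options = ExternalCatalogDatasetOptions(
    defaultStorageLocationUri="gs://my-bucket/my_dataset",  # hypothetical URI
    parameters={"owner": "data-eng"},  # hypothetical hive-style parameter
)

# "external_catalog_dataset_options" is mapped to "externalCatalogDatasetOptions"
# in _PROPERTY_TO_API_FIELD above, so it can be named in the update field list.
dataset = client.update_dataset(dataset, ["external_catalog_dataset_options"])
```
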
92 changes: 92 additions & 0 deletions google/cloud/bigquery/external_config.py
@@ -1003,3 +1003,95 @@ def from_api_repr(cls, resource: dict) -> "ExternalConfig":
config = cls(resource["sourceFormat"])
config._properties = copy.deepcopy(resource)
return config


class ExternalCatalogDatasetOptions(object):
"""Options defining open source compatible datasets living in the BigQuery catalog.
Contains metadata of open source database, schema or namespace represented
by the current dataset.

Args:
defaultStorageLocationUri: Optional. The storage location URI for all
tables in the dataset. Equivalent to hive metastore's database
locationUri. Maximum length of 1024 characters. (str)
parameters: Optional. A map of key value pairs defining the parameters
and properties of the open source schema. Maximum size of 2Mib.
"""

def __init__(self, defaultStorageLocationUri: Optional[str] = None, parameters: Optional[dict] = None):
self._properties = {}
if not isinstance(defaultStorageLocationUri, (str, type(None))):
raise ValueError(
"Pass defaultStorageLocationUri as a 'str' or None. "
f"Got {repr(defaultStorageLocationUri)}."
)
if not isinstance(parameters, (dict, type(None))):
raise ValueError(
"Pass parameters as a 'dict' or None. "
f"Got {repr(parameters)}."
)
self._properties["defaultStorageLocationUri"] = defaultStorageLocationUri
self._properties["parameters"] = parameters

def to_api_repr(self) -> dict:
"""Build an API representation of this object.

Returns:
Dict[str, Any]:
A dictionary in the format used by the BigQuery API.
"""
config = copy.deepcopy(self._properties)
return config
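
This two-commit view only adds `to_api_repr()`; later commits in the PR (e.g. 4f117a7 and 79bbeb2) touch `from_api_repr`, which is not shown here. A hedged sketch of what such a factory might look like, mirroring `ExternalConfig.from_api_repr` earlier in this file:

```python
@classmethod
def from_api_repr(cls, resource: dict) -> "ExternalCatalogDatasetOptions":
    """Construct the options object from its API (camelCase) representation.

    This mirrors ExternalConfig.from_api_repr above; the signature that
    eventually lands in this PR may differ.
    """
    config = cls()
    config._properties = copy.deepcopy(resource)
    return config
```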

class ExternalCatalogTableOptions(object):
"""Metadata about open source compatible table. The fields contained in these
options correspond to hive metastore's table level properties.

Args:
connectionId: Optional. The connection specifying the credentials to be
used to read external storage, such as Azure Blob, Cloud Storage, or
S3. The connection is needed to read the open source table from
BigQuery Engine. The connection_id can have the form `..` or
`projects//locations//connections/`. (str)
parameters: Optional. A map of key value pairs defining the parameters
and properties of the open source table. Corresponds with hive meta
store table parameters. Maximum size of 4Mib. (dict)
storageDescriptor: Optional. A storage descriptor containing information
about the physical storage of this table. (StorageDescriptor)
"""

def __init__(
self,
connectionId: Optional[str] = None,
parameters: Optional[dict] = None,
storageDescriptor: Optional[str] = None # TODO implement StorageDescriptor, correct this type hint
):
self._properties = {}
if not isinstance(connectionId, (str, type(None))):
raise ValueError(
"Pass connectionId as a 'str' or None. "
f"Got {repr(connectionId)}."
)
if not isinstance(parameters, (dict, type(None))):
raise ValueError(
"Pass parameters as a 'dict' or None. "
f"Got {repr(parameters)}."
)
if not isinstance(storageDescriptor, (str, type(None))):  # TODO implement StorageDescriptor, correct this check
raise ValueError(
"Pass storageDescriptor as a 'str' or None. "
f"Got {repr(storageDescriptor)}."
)
self._properties["connectionId"] = connectionId
self._properties["parameters"] = parameters
self._properties["storageDescriptor"] = storageDescriptor

def to_api_repr(self) -> dict:
"""Build an API representation of this object.

Returns:
Dict[str, Any]:
A dictionary in the format used by the BigQuery API.
"""
config = copy.deepcopy(self._properties)
return config
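
A short sketch showing how the two option classes above round-trip to their API representation, given the constructors in this diff. All identifiers and parameter values below are invented for illustration, and `storageDescriptor` is passed as a plain string only because the `StorageDescriptor` class is still a TODO at this commit.

```python
from google.cloud.bigquery.external_config import (
    ExternalCatalogDatasetOptions,
    ExternalCatalogTableOptions,
)

dataset_opts = ExternalCatalogDatasetOptions(
    defaultStorageLocationUri="gs://my-bucket/warehouse",  # hypothetical URI
    parameters={"hive.database.owner": "analytics"},  # hypothetical parameter
)
table_opts = ExternalCatalogTableOptions(
    connectionId="my-project.us.my-connection",  # hypothetical connection ID
    parameters={"serialization.format": "1"},  # hypothetical hive parameter
    storageDescriptor="gs://my-bucket/warehouse/my_table",  # placeholder string
)

# to_api_repr() returns a deep copy of the stored properties, keyed with the
# camelCase names the BigQuery API expects.
print(dataset_opts.to_api_repr())
# {'defaultStorageLocationUri': 'gs://my-bucket/warehouse',
#  'parameters': {'hive.database.owner': 'analytics'}}
print(table_opts.to_api_repr())
```
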
59 changes: 59 additions & 0 deletions google/cloud/bigquery/table.py
@@ -69,6 +69,7 @@
from google.cloud.bigquery.schema import _build_schema_resource
from google.cloud.bigquery.schema import _parse_schema_resource
from google.cloud.bigquery.schema import _to_schema_fields
from google.cloud.bigquery.external_config import ExternalCatalogTableOptions

if typing.TYPE_CHECKING: # pragma: NO COVER
# Unconditionally import optional dependencies again to tell pytype that
@@ -999,6 +1000,24 @@ def table_constraints(self) -> Optional["TableConstraints"]:
table_constraints = TableConstraints.from_api_repr(table_constraints)
return table_constraints

@property
def external_catalog_table_options(self):
"""Options defining an open source compatible table living in the
BigQuery catalog. Contains metadata about the open source table
represented by the current table, corresponding to hive metastore's
table-level properties."""

return self._properties.get("externalCatalogTableOptions")

@external_catalog_table_options.setter
def external_catalog_table_options(self, value):
if not isinstance(value, ExternalCatalogTableOptions) and value is not None:
raise ValueError(
"external_catalog_table_options must be an "
"ExternalCatalogTableOptions object or None. "
f"Got {repr(value)}."
)
self._properties["externalCatalogTableOptions"] = value

@classmethod
def from_string(cls, full_table_id: str) -> "Table":
"""Construct a table from fully-qualified table ID.
@@ -3308,3 +3327,43 @@ def _table_arg_to_table(value, default_project=None) -> Table:
value = newvalue

return value


class ExternalCatalogTableOptions(object):
"""Metadata about open source compatible table. The fields contained in these
options correspond to hive metastore's table level properties.

Args:
connectionId: Optional. The connection specifying the credentials to be
used to read external storage, such as Azure Blob, Cloud Storage, or
S3. The connection is needed to read the open source table from
BigQuery Engine. The connection_id can have the form `..` or
`projects//locations//connections/`. (str)
parameters: Optional. A map of key value pairs defining the parameters
and properties of the open source table. Corresponds with hive meta
store table parameters. Maximum size of 4Mib.
storageDescriptor: Optional. A storage descriptor containing information
about the physical storage of this table.
"""

def __init__(self, connectionId=None, parameters=None, storageDescriptor=None):
self._properties = {}
self.connectionId = connectionId
self.parameters = parameters
self.storageDescriptor = storageDescriptor

if not isinstance(connectionId, (str, type(None))):
raise ValueError(
"connectionId must be a str or None. "
f"Got {repr(connectionId)}."
)

if not isinstance(parameters, (dict, type(None))):
raise ValueError(
"parameters must be a dict or None. "
f"Got {repr(parameters)}."
)

if not isinstance(storageDescriptor, (str, type(None))):  # TODO implement StorageDescriptor, correct this check
raise ValueError(
"storageDescriptor must be a str or None. "
f"Got {repr(storageDescriptor)}."
)
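
And a matching sketch for the table-level property added above. The table ID, connection ID, and parameters are invented; this diff does not show a `_PROPERTY_TO_API_FIELD` entry for the new Table property, so whether `update_table` can serialize it at this commit is unclear, and the example only sets and reads the property locally.

```python
from google.cloud import bigquery
from google.cloud.bigquery.external_config import ExternalCatalogTableOptions

client = bigquery.Client()
table = client.get_table("my-project.my_dataset.my_table")  # hypothetical table ID

# The setter accepts only ExternalCatalogTableOptions or None, mirroring the
# Dataset-level property, and raises ValueError otherwise.
table.external_catalog_table_options = ExternalCatalogTableOptions(
    connectionId="my-project.us.my-connection",  # hypothetical connection ID
    parameters={"EXTERNAL": "TRUE"},  # hypothetical hive table property
)

print(table.external_catalog_table_options)  # the options object stored above
```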