Source code for flytekitplugins.bigquery.task

from dataclasses import dataclass
from typing import Any, Dict, Optional, Type

from google.protobuf import json_format
from google.protobuf.struct_pb2 import Struct

from flytekit import lazy_module
from flytekit.configuration import SerializationSettings
from flytekit.extend import SQLTask
from flytekit.extend.backend.base_agent import AsyncAgentExecutorMixin
from flytekit.models import task as _task_model
from flytekit.types.structured import StructuredDataset

bigquery = lazy_module("google.cloud.bigquery")


@dataclass
class BigQueryConfig:
    """
    Configuration container for a BigQuery task.

    Only ``ProjectID`` is mandatory; the remaining fields default to ``None``.
    """

    # Project identifier; required, has no default.
    ProjectID: str
    # Optional location string; left as None when not supplied.
    Location: Optional[str] = None
    # Optional google.cloud.bigquery QueryJobConfig carrying extra query settings.
    QueryJobConfig: Optional[bigquery.QueryJobConfig] = None
class BigQueryTask(AsyncAgentExecutorMixin, SQLTask[BigQueryConfig]):
    """
    Basic BigQuery task. It can be used for queries that return data as well
    as for queries that do not produce any output.
    """

    # This task is executed using the BigQuery handler in the backend.
    # https://github.com/flyteorg/flyteplugins/blob/43623826fb189fa64dc4cb53e7025b517d911f22/go/tasks/plugins/webapi/bigquery/plugin.go#L34
    _TASK_TYPE = "bigquery_query_job_task"

    def __init__(
        self,
        name: str,
        query_template: str,
        task_config: Optional[BigQueryConfig],
        inputs: Optional[Dict[str, Type]] = None,
        output_structured_dataset_type: Optional[Type[StructuredDataset]] = None,
        **kwargs,
    ):
        """
        Create a task that queries BigQuery tables.

        :param name: Name of this task, should be unique in the project
        :param query_template: The actual query to run. Flyte's Golang templating format is used
            for query templating. Refer to the templating documentation
        :param task_config: BigQueryConfig object
        :param inputs: Name and type of inputs specified as an ordered dictionary
        :param output_structured_dataset_type: If the query produces data, the StructuredDataset
            type of that output; when given, the task declares a single output named ``results``
        :param kwargs: All other args required by Parent type - SQLTask
        """
        # A single "results" output is declared only when an output type was requested;
        # otherwise the parent receives outputs=None.
        declared_outputs = (
            {"results": output_structured_dataset_type}
            if output_structured_dataset_type is not None
            else None
        )
        super().__init__(
            name=name,
            task_config=task_config,
            query_template=query_template,
            inputs=inputs,
            outputs=declared_outputs,
            task_type=self._TASK_TYPE,
            **kwargs,
        )
        self._output_structured_dataset_type = output_structured_dataset_type

    def get_custom(self, settings: SerializationSettings) -> Dict[str, Any]:
        """
        Build the custom dictionary for this task from its task config.

        The dictionary starts with the config's ``Location`` and ``ProjectID``. When a
        ``QueryJobConfig`` is set, the ``"query"`` section of its ``to_api_repr()`` output is
        merged on top. The result is passed through a protobuf ``Struct`` and converted back
        to a plain dict.

        :param settings: Serialization settings; not used by this implementation
        :return: The custom configuration as a plain dictionary
        """
        # NOTE(review): task_config is typed Optional in __init__ but is dereferenced
        # unconditionally here - confirm callers never pass None.
        cfg = self.task_config
        custom = {
            "Location": cfg.Location,
            "ProjectID": cfg.ProjectID,
        }
        job_config = cfg.QueryJobConfig
        if job_config is not None:
            custom.update(job_config.to_api_repr()["query"])

        struct = Struct()
        struct.update(custom)
        return json_format.MessageToDict(struct)

    def get_sql(self, settings: SerializationSettings) -> Optional[_task_model.Sql]:
        """
        Wrap this task's query template in a ``Sql`` model using the ANSI dialect.

        :param settings: Serialization settings; not used by this implementation
        :return: The Sql model holding the query template
        """
        return _task_model.Sql(
            statement=self.query_template,
            dialect=_task_model.Sql.Dialect.ANSI,
        )