
Commit c95980a

[Data] Add BigQueryDatasource (#37380)
Ray Datasets supports parallelized reads and writes across many data formats (CSV, Parquet, Arrow, etc.) and sources (S3, GCS, etc.), but BigQuery does not have full coverage yet. As of Ray 2.5.1, only reads from BigQuery are supported, via read_sql() (docs ref). That path relies on the basic BigQuery Query API and has two issues:

1. read_sql() uses the basic BigQuery Query API (library link), which is slower than the BigQuery Storage API (library link). This is most noticeable for medium-sized datasets (e.g., 10 million rows), where a Storage Read can be around 5-10x faster than read_sql().

2. BigQuery does not guarantee result ordering, so read_sql() can return incorrect results, because it relies on OFFSET to parallelize reads through the basic Query API. (The BigQuery Storage API is designed for parallelized use cases and creates independent read/write streams.)

This PR introduces a BigQuery datasource that handles both reads and writes. Reads use the BigQuery Storage API for the reasons above. Writes use a Load Job rather than the BigQuery Storage Write API, since the latter requires serializing each block/table to protobuf, which, when done in Python, is not faster than the Load Job write.

---------

Signed-off-by: Matthew Tang <[email protected]>
Signed-off-by: Balaji Veeramani <[email protected]>
Co-authored-by: Balaji Veeramani <[email protected]>
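As a rough usage sketch of the APIs added by this commit (the project and table names are hypothetical, read_bigquery is assumed to forward project_id, dataset, and query to the reader introduced below, and the BigQuery client libraries and credentials are assumed to be set up):

import ray

# Read a table through the BigQuery Storage Read API (one read task per stream).
ds = ray.data.read_bigquery(
    project_id="my-gcp-project",           # hypothetical project ID
    dataset="my_dataset.my_source_table",  # dataset_id.table_id
)

# A SQL query can be read instead; query and dataset are mutually exclusive.
# ds = ray.data.read_bigquery(project_id="my-gcp-project", query="SELECT ...")

# Write back to BigQuery; each block is loaded with a Parquet load job.
ds.write_bigquery(
    project_id="my-gcp-project",
    dataset="my_dataset.my_destination_table",
)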
1 parent 5abc4ab commit c95980a

11 files changed (+625 −59 lines)


doc/source/data/api/input_output.rst

Lines changed: 9 additions & 0 deletions
@@ -140,6 +140,15 @@ MongoDB
     read_mongo
     Dataset.write_mongo
 
+BigQuery
+--------
+
+.. autosummary::
+   :toctree: doc/
+
+   read_bigquery
+   Dataset.write_bigquery
+
 SQL Databases
 -------------
 
python/ray/data/BUILD

Lines changed: 8 additions & 0 deletions
@@ -73,6 +73,14 @@ py_test(
     deps = ["//:ray_lib", ":conftest"],
 )
 
+py_test(
+    name = "test_bigquery",
+    size = "large",
+    srcs = ["tests/test_bigquery.py"],
+    tags = ["team:data", "exclusive", "data_integration"],
+    deps = ["//:ray_lib", ":conftest"],
+)
+
 py_test(
     name = "test_actor_pool_map_operator",
     size = "small",

python/ray/data/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@
     range,
     range_table,
     range_tensor,
+    read_bigquery,
     read_binary_files,
     read_csv,
     read_databricks_tables,

python/ray/data/_internal/logical/util.py

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@
 # The white list of operator names allowed to be recorded.
 _op_name_white_list = [
     # Read
+    "ReadBigQuery",
     "ReadRange",
     "ReadMongo",
     "ReadParquet",
@@ -35,6 +36,7 @@
     "FromNumpy",
     "FromPandas",
     # Write
+    "WriteBigQuery",
     "WriteParquet",
     "WriteJSON",
     "WriteCSV",

python/ray/data/dataset.py

Lines changed: 46 additions & 0 deletions
@@ -114,6 +114,7 @@
     DataContext,
 )
 from ray.data.datasource import (
+    BigQueryDatasource,
     BlockWritePathProvider,
     Connection,
     CSVDatasource,
@@ -3280,6 +3281,7 @@ def write_sql(
             sql=sql,
         )
 
+    @PublicAPI(stability="alpha")
     @ConsumptionAPI
     def write_mongo(
         self,
@@ -3348,6 +3350,50 @@ def write_mongo(
             collection=collection,
         )
 
+    @ConsumptionAPI
+    def write_bigquery(
+        self,
+        project_id: str,
+        dataset: str,
+        ray_remote_args: Dict[str, Any] = None,
+    ) -> None:
+        """Write the dataset to a BigQuery dataset table.
+
+        To control the number of parallel write tasks, use ``.repartition()``
+        before calling this method.
+
+        Examples:
+            .. testcode::
+                :skipif: True
+
+                import ray
+                import pandas as pd
+
+                docs = [{"title": "BigQuery Datasource test"} for key in range(4)]
+                ds = ray.data.from_pandas(pd.DataFrame(docs))
+                ds.write_bigquery(
+                    project_id="my_project_id",
+                    dataset="my_dataset.my_table",
+                )
+
+        Args:
+            project_id: The name of the associated Google Cloud Project that hosts
+                the destination dataset. For more information, see
+                `Creating and managing projects <https://cloud.google.com/resource-manager/docs/creating-managing-projects>`_.  # noqa: E501
+            dataset: The name of the dataset in the format of ``dataset_id.table_id``.
+                The dataset is created if it doesn't already exist, and the table is
+                overwritten if it already exists.
+            ray_remote_args: Kwargs passed to ``ray.remote`` in the write tasks.
+        """
+        self.write_datasource(
+            BigQueryDatasource(),
+            ray_remote_args=ray_remote_args,
+            dataset=dataset,
+            project_id=project_id,
+        )
+
     @ConsumptionAPI(pattern="Time complexity:")
     def write_datasource(
         self,
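Since the write path issues one BigQuery load job per block, the docstring's note about calling .repartition() before writing maps directly onto write parallelism; a minimal sketch, assuming an existing Dataset ds and the hypothetical names from the example above:

# Ten blocks -> ten write tasks, each loading its block into BigQuery as a separate load job.
ds.repartition(10).write_bigquery(
    project_id="my_project_id",
    dataset="my_dataset.my_table",
)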

python/ray/data/datasource/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,4 @@
+from ray.data.datasource.bigquery_datasource import BigQueryDatasource
 from ray.data.datasource.binary_datasource import BinaryDatasource
 from ray.data.datasource.csv_datasource import CSVDatasource
 from ray.data.datasource.datasource import (
@@ -48,6 +49,7 @@
 __all__ = [
     "BaseFileMetadataProvider",
     "BinaryDatasource",
+    "BigQueryDatasource",
     "BlockWritePathProvider",
     "Connection",
     "CSVDatasource",
python/ray/data/datasource/bigquery_datasource.py

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
import logging
import os
import tempfile
import time
import uuid
from typing import Any, Dict, List, Optional

import pyarrow.parquet as pq

from ray.data._internal.execution.interfaces import TaskContext
from ray.data._internal.util import _check_import
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.datasource.datasource import Datasource, Reader, ReadTask, WriteResult
from ray.types import ObjectRef
from ray.util.annotations import PublicAPI

logger = logging.getLogger(__name__)

MAX_RETRY_CNT = 10
RATE_LIMIT_EXCEEDED_SLEEP_TIME = 11


class _BigQueryDatasourceReader(Reader):
    def __init__(
        self,
        project_id: str,
        dataset: Optional[str] = None,
        query: Optional[str] = None,
        parallelism: Optional[int] = -1,
        **kwargs: Optional[Dict[str, Any]],
    ):
        self._project_id = project_id
        self._dataset = dataset
        self._query = query
        self._kwargs = kwargs

        if query is not None and dataset is not None:
            raise ValueError(
                "Query and dataset kwargs cannot both be provided "
                + "(must be mutually exclusive)."
            )

    def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
        from google.cloud import bigquery, bigquery_storage

        def _read_single_partition(stream) -> Block:
            client = bigquery_storage.BigQueryReadClient()
            reader = client.read_rows(stream.name)
            return reader.to_arrow()

        if self._query:
            query_client = bigquery.Client(project=self._project_id)
            query_job = query_client.query(self._query)
            query_job.result()
            destination = str(query_job.destination)
            dataset_id = destination.split(".")[-2]
            table_id = destination.split(".")[-1]
        else:
            self._validate_dataset_table_exist(self._project_id, self._dataset)
            dataset_id = self._dataset.split(".")[0]
            table_id = self._dataset.split(".")[1]

        bqs_client = bigquery_storage.BigQueryReadClient()
        table = f"projects/{self._project_id}/datasets/{dataset_id}/tables/{table_id}"

        if parallelism == -1:
            parallelism = None
        requested_session = bigquery_storage.types.ReadSession(
            table=table,
            data_format=bigquery_storage.types.DataFormat.ARROW,
        )
        read_session = bqs_client.create_read_session(
            parent=f"projects/{self._project_id}",
            read_session=requested_session,
            max_stream_count=parallelism,
        )

        read_tasks = []
        logger.info("Created streams: " + str(len(read_session.streams)))
        # Only compare when parallelism was requested (it may be None for auto).
        if parallelism is not None and len(read_session.streams) < parallelism:
            logger.info(
                "The number of streams created by the "
                + "BigQuery Storage Read API is less than the requested "
                + "parallelism due to the size of the dataset."
            )

        for stream in read_session.streams:
            # Create a metadata block object to store schema, etc.
            metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=None,
                input_files=None,
                exec_stats=None,
            )

            # Create the read task and pass the no-arg wrapper and metadata in.
            # The default argument binds the current stream to each lambda.
            read_task = ReadTask(
                lambda stream=stream: [_read_single_partition(stream)],
                metadata,
            )
            read_tasks.append(read_task)

        return read_tasks

    def estimate_inmemory_data_size(self) -> Optional[int]:
        return None

    def _validate_dataset_table_exist(self, project_id: str, dataset: str) -> None:
        from google.api_core import exceptions
        from google.cloud import bigquery

        client = bigquery.Client(project=project_id)
        dataset_id = dataset.split(".")[0]
        try:
            client.get_dataset(dataset_id)
        except exceptions.NotFound:
            raise ValueError(
                "Dataset {} is not found. Please ensure that it exists.".format(
                    dataset_id
                )
            )

        try:
            client.get_table(dataset)
        except exceptions.NotFound:
            raise ValueError(
                "Table {} is not found. Please ensure that it exists.".format(dataset)
            )

@PublicAPI(stability="alpha")
class BigQueryDatasource(Datasource):
    def create_reader(self, **kwargs) -> Reader:
        _check_import(self, module="google.cloud", package="bigquery")
        _check_import(self, module="google.cloud", package="bigquery_storage")
        _check_import(self, module="google.api_core", package="exceptions")
        return _BigQueryDatasourceReader(**kwargs)

    def write(
        self,
        blocks: List[ObjectRef[Block]],
        ctx: TaskContext,
        project_id: str,
        dataset: str,
    ) -> WriteResult:
        from google.api_core import exceptions
        from google.cloud import bigquery

        def _write_single_block(block: Block, project_id: str, dataset: str):
            block = BlockAccessor.for_block(block).to_arrow()

            client = bigquery.Client(project=project_id)
            job_config = bigquery.LoadJobConfig(autodetect=True)
            job_config.source_format = bigquery.SourceFormat.PARQUET
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

            with tempfile.TemporaryDirectory() as temp_dir:
                fp = os.path.join(temp_dir, f"block_{uuid.uuid4()}.parquet")
                pq.write_table(block, fp, compression="SNAPPY")

                retry_cnt = 0
                while retry_cnt < MAX_RETRY_CNT:
                    with open(fp, "rb") as source_file:
                        job = client.load_table_from_file(
                            source_file, dataset, job_config=job_config
                        )
                    retry_cnt += 1
                    try:
                        logger.info(job.result())
                        break
                    except exceptions.Forbidden as e:
                        logger.info("Rate limit exceeded... Sleeping to try again")
                        logger.debug(e)
                        time.sleep(RATE_LIMIT_EXCEEDED_SLEEP_TIME)

        # Set up datasets to write
        client = bigquery.Client(project=project_id)
        dataset_id = dataset.split(".", 1)[0]
        try:
            client.create_dataset(f"{project_id}.{dataset_id}", timeout=30)
            logger.info("Created dataset " + dataset_id)
        except exceptions.Conflict:
            logger.info(
                f"Dataset {dataset_id} already exists. "
                "The table will be overwritten if it already exists."
            )

        # Delete table if it already exists
        client.delete_table(f"{project_id}.{dataset}", not_found_ok=True)

        for block in blocks:
            _write_single_block(block, project_id, dataset)
        return "ok"
