
Commit f46439d

feature/ODSC-41635/Support Data Flow Pools (#212)
1 parent 2dbc2a0 commit f46439d

File tree: 4 files changed, +74 -36 lines changed

  ads/jobs/builders/infrastructure/dataflow.py
  docs/source/user_guide/apachespark/dataflow-spark-magic.rst
  docs/source/user_guide/apachespark/dataflow.rst
  tests/unitary/default_setup/jobs/test_jobs_dataflow.py

ads/jobs/builders/infrastructure/dataflow.py (+37 -4)
@@ -391,6 +391,7 @@ class DataFlow(Infrastructure):
     CONST_OCPUS = "ocpus"
     CONST_ID = "id"
     CONST_PRIVATE_ENDPOINT_ID = "private_endpoint_id"
+    CONST_POOL_ID = "pool_id"
     CONST_FREEFORM_TAGS = "freeform_tags"
     CONST_DEFINED_TAGS = "defined_tags"

@@ -411,8 +412,9 @@ class DataFlow(Infrastructure):
         CONST_OCPUS: CONST_OCPUS,
         CONST_ID: CONST_ID,
         CONST_PRIVATE_ENDPOINT_ID: "privateEndpointId",
+        CONST_POOL_ID: "poolId",
         CONST_FREEFORM_TAGS: "freeformTags",
-        CONST_DEFINED_TAGS: "definedTags"
+        CONST_DEFINED_TAGS: "definedTags",
     }

     def __init__(self, spec: dict = None, **kwargs):
@@ -425,8 +427,10 @@ def __init__(self, spec: dict = None, **kwargs):
         spec = {
             k: v
             for k, v in spec.items()
-            if (f"with_{camel_to_snake(k)}" in self.__dir__()
-                or (k == "defined_tags" or "freeform_tags"))
+            if (
+                f"with_{camel_to_snake(k)}" in self.__dir__()
+                or (k == "defined_tags" or "freeform_tags")
+            )
             and v is not None
         }
         defaults.update(spec)
@@ -809,10 +813,34 @@ def with_defined_tag(self, **kwargs) -> "DataFlow":
         """
         return self.set_spec(self.CONST_DEFINED_TAGS, kwargs)

+    def with_pool_id(self, pool_id: str) -> "DataFlow":
+        """
+        Set the Data Flow Pool Id for a Data Flow job.
+
+        Parameters
+        ----------
+        pool_id: str
+            The OCID of a Data Flow Pool.
+
+        Returns
+        -------
+        DataFlow
+            the Data Flow instance itself
+        """
+        if not hasattr(CreateApplicationDetails, "pool_id"):
+            raise EnvironmentError(
+                "Data Flow Pool has not been supported in the current OCI SDK installed."
+            )
+        return self.set_spec(self.CONST_POOL_ID, pool_id)
+
     def __getattr__(self, item):
         if item == self.CONST_DEFINED_TAGS or item == self.CONST_FREEFORM_TAGS:
             return self.get_spec(item)
-        elif f"with_{item}" in self.__dir__() and item != "defined_tag" and item != "freeform_tag":
+        elif (
+            f"with_{item}" in self.__dir__()
+            and item != "defined_tag"
+            and item != "freeform_tag"
+        ):
             return self.get_spec(item)
         raise AttributeError(f"Attribute {item} not found.")

@@ -832,6 +860,11 @@ def create(self, runtime: DataFlowRuntime, **kwargs) -> "DataFlow":
         DataFlow
             a Data Flow job instance
         """
+        if self.pool_id:
+            if not hasattr(CreateApplicationDetails, "pool_id"):
+                raise EnvironmentError(
+                    "Data Flow Pool has not been supported in the current OCI SDK installed."
+                )
         # Set default display_name if not specified - randomly generated easy to remember name
         if not self.name:
             self.name = utils.get_random_name_for_resource()
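
For orientation, here is a minimal sketch of how the new builder method might be used. It assumes the public ads.jobs imports (DataFlow, DataFlowRuntime) and an installed oci SDK whose CreateApplicationDetails exposes pool_id; every OCID, bucket, and script path below is a placeholder.

from ads.jobs import DataFlow, DataFlowRuntime

# Build a Data Flow job that runs on an existing Data Flow Pool.
# Placeholder values throughout. with_pool_id() raises EnvironmentError
# immediately, and create() re-checks, if the installed oci SDK has no
# CreateApplicationDetails.pool_id attribute.
df = (
    DataFlow()
    .with_compartment_id("ocid1.compartment.oc1..<unique_ocid>")
    .with_logs_bucket_uri("oci://mybucket@mynamespace/dflogs")
    .with_driver_shape("VM.Standard.E4.Flex")
    .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
    .with_executor_shape("VM.Standard.E4.Flex")
    .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
    .with_pool_id("ocid1.dataflowpool.oc1..<unique_ocid>")  # new in this commit
)

rt = DataFlowRuntime().with_script_uri("oci://mybucket@mynamespace/scripts/example.py")

df.create(rt)  # submits the application; pool support is re-checked here

Because the payload attribute map translates pool_id to poolId, the pool assignment should also survive the to_dict()/to_yaml() round trips exercised in the tests below.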

docs/source/user_guide/apachespark/dataflow-spark-magic.rst (+1 -1)

@@ -32,7 +32,7 @@ Data Flow Sessions are accessible through the following conda environment:

 * PySpark 3.2 and Data Flow 2.0 (pyspark32_p38_cpu_v2)

-You can customize **pypspark32_p38_cpu_v1**, publish it, and use it as a runtime environment for a Data Flow Session.
+You can customize **pypspark32_p38_cpu_v2**, publish it, and use it as a runtime environment for a Data Flow Session.

 Policies
 ********

docs/source/user_guide/apachespark/dataflow.rst (+11 -11)
@@ -159,9 +159,9 @@ You could submit a notebook using ADS SDK APIs. Here is an example to submit a n
             "ocid1.compartment.oc1.<your compartment id>"
         )
         .with_driver_shape("VM.Standard.E4.Flex")
-        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
-        .with_executor_shape("VM.Standard.E4.Flex")
-        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
+        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
+        .with_executor_shape("VM.Standard.E4.Flex")
+        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
         .with_logs_bucket_uri("oci://mybucket@mytenancy/")
         .with_private_endpoint_id("ocid1.dataflowprivateendpoint.oc1.iad.<your private endpoint ocid>")
         .with_configuration({
@@ -231,8 +231,8 @@ create applications.

 In the following "hello-world" example, ``DataFlow`` is populated with ``compartment_id``,
 ``driver_shape``, ``driver_shape_config``, ``executor_shape``, ``executor_shape_config``
-, ``spark_version``, ``defined_tags`` and ``freeform_tags``. ``DataFlowRuntime`` is
-populated with ``script_uri`` and ``script_bucket``. The ``script_uri`` specifies the
+, ``spark_version``, ``defined_tags`` and ``freeform_tags``. ``DataFlowRuntime`` is
+populated with ``script_uri`` and ``script_bucket``. The ``script_uri`` specifies the
 path to the script. It can be local or remote (an Object Storage path). If the path
 is local, then ``script_bucket`` must be specified additionally because Data Flow
 requires a script to be available in Object Storage. ADS
@@ -270,9 +270,9 @@ accepted. In the next example, the prefix is given for ``script_bucket``.
         .with_compartment_id("oci.xx.<compartment_id>")
         .with_logs_bucket_uri("oci://mybucket@mynamespace/dflogs")
         .with_driver_shape("VM.Standard.E4.Flex")
-        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
-        .with_executor_shape("VM.Standard.E4.Flex")
-        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
+        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
+        .with_executor_shape("VM.Standard.E4.Flex")
+        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
         .with_spark_version("3.0.2")
         .with_defined_tag(
             **{"Oracle-Tags": {"CreatedBy": "[email protected]"}}
@@ -391,9 +391,9 @@ In the next example, ``archive_uri`` is given as an Object Storage location.
         .with_compartment_id("oci1.xxx.<compartment_ocid>")
         .with_logs_bucket_uri("oci://mybucket@mynamespace/prefix")
         .with_driver_shape("VM.Standard.E4.Flex")
-        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
-        .with_executor_shape("VM.Standard.E4.Flex")
-        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
+        .with_driver_shape_config(ocpus=2, memory_in_gbs=32)
+        .with_executor_shape("VM.Standard.E4.Flex")
+        .with_executor_shape_config(ocpus=4, memory_in_gbs=64)
         .with_spark_version("3.0.2")
         .with_configuration({
             "spark.driverEnv.myEnvVariable": "value1",

tests/unitary/default_setup/jobs/test_jobs_dataflow.py (+25 -20)
@@ -28,6 +28,7 @@
     DataFlowRuntime,
     DataFlowNotebookRuntime,
 )
+from oci.data_flow.models import CreateApplicationDetails

 logger.setLevel(logging.DEBUG)

@@ -47,7 +48,13 @@
     language="PYTHON",
     logs_bucket_uri="oci://test_bucket@test_namespace/",
     private_endpoint_id="test_private_endpoint",
+    pool_id="ocid1.dataflowpool.oc1..<unique_ocid>",
 )
+EXPECTED_YAML_LENGTH = 614
+if not hasattr(CreateApplicationDetails, "pool_id"):
+    SAMPLE_PAYLOAD.pop("pool_id")
+    EXPECTED_YAML_LENGTH = 567
+
 random_seed = 42
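
The hasattr() probe above gates pool-specific expectations on the installed oci SDK. An equivalent, purely illustrative pattern (not part of this commit; the test name is hypothetical) is a pytest skip marker:

import pytest
from oci.data_flow.models import CreateApplicationDetails

from ads.jobs import DataFlow

# Illustrative only: skip pool-specific tests when the installed oci SDK
# predates Data Flow Pool support (no CreateApplicationDetails.pool_id).
requires_pool_support = pytest.mark.skipif(
    not hasattr(CreateApplicationDetails, "pool_id"),
    reason="Installed oci SDK does not support Data Flow Pools.",
)

@requires_pool_support
def test_pool_id_round_trip():
    pool_ocid = "ocid1.dataflowpool.oc1..<unique_ocid>"
    df = DataFlow().with_pool_id(pool_ocid)
    assert df.pool_id == pool_ocid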

@@ -124,7 +131,7 @@ def test_create_delete(self, mock_to_dict, mock_client):
             df.lifecycle_state
             == oci.data_flow.models.Application.LIFECYCLE_STATE_DELETED
         )
-        assert len(df.to_yaml()) == 567
+        assert len(df.to_yaml()) == EXPECTED_YAML_LENGTH

     def test_create_df_app_with_default_display_name(
         self,
@@ -319,14 +326,16 @@ def df(self):
             ).with_num_executors(
                 2
             ).with_private_endpoint_id(
-                "test_private_endpoint"
+                SAMPLE_PAYLOAD["private_endpoint_id"]
             ).with_freeform_tag(
                 test_freeform_tags_key="test_freeform_tags_value",
             ).with_defined_tag(
                 test_defined_tags_namespace={
                     "test_defined_tags_key": "test_defined_tags_value"
                 }
             )
+        if SAMPLE_PAYLOAD.get("pool_id", None):
+            df.with_pool_id(SAMPLE_PAYLOAD["pool_id"])
         return df

     def test_create_with_builder_pattern(self, mock_to_dict, mock_client, df):
@@ -341,6 +350,8 @@ def test_create_with_builder_pattern(self, mock_to_dict, mock_client, df):
                 "test_defined_tags_key": "test_defined_tags_value"
             }
         }
+        if SAMPLE_PAYLOAD.get("pool_id", None):
+            assert df.pool_id == SAMPLE_PAYLOAD["pool_id"]

         rt = (
             DataFlowRuntime()
@@ -483,50 +494,44 @@ def test_to_and_from_dict(self, df):
         assert df3_dict["spec"]["numExecutors"] == 2

     def test_shape_and_details(self, mock_to_dict, mock_client, df):
-        df.with_driver_shape(
-            "VM.Standard2.1"
-        ).with_executor_shape(
+        df.with_driver_shape("VM.Standard2.1").with_executor_shape(
             "VM.Standard.E4.Flex"
         )

         rt = (
             DataFlowRuntime()
-                .with_script_uri(SAMPLE_PAYLOAD["file_uri"])
-                .with_archive_uri(SAMPLE_PAYLOAD["archive_uri"])
-                .with_custom_conda(
-                    "oci://my_bucket@my_namespace/conda_environments/cpu/PySpark 3.0 and Data Flow/5.0/pyspark30_p37_cpu_v5"
-                )
-                .with_overwrite(True)
+            .with_script_uri(SAMPLE_PAYLOAD["file_uri"])
+            .with_archive_uri(SAMPLE_PAYLOAD["archive_uri"])
+            .with_custom_conda(
+                "oci://my_bucket@my_namespace/conda_environments/cpu/PySpark 3.0 and Data Flow/5.0/pyspark30_p37_cpu_v5"
+            )
+            .with_overwrite(True)
         )

         with pytest.raises(
             ValueError,
-            match="`executor_shape` and `driver_shape` must be from the same shape family."
+            match="`executor_shape` and `driver_shape` must be from the same shape family.",
         ):
             with patch.object(DataFlowApp, "client", mock_client):
                 with patch.object(DataFlowApp, "to_dict", mock_to_dict):
                     df.create(rt)

-        df.with_driver_shape(
-            "VM.Standard2.1"
-        ).with_driver_shape_config(
+        df.with_driver_shape("VM.Standard2.1").with_driver_shape_config(
             memory_in_gbs=SAMPLE_PAYLOAD["driver_shape_config"]["memory_in_gbs"],
             ocpus=SAMPLE_PAYLOAD["driver_shape_config"]["ocpus"],
-        ).with_executor_shape(
-            "VM.Standard2.16"
-        ).with_executor_shape_config(
+        ).with_executor_shape("VM.Standard2.16").with_executor_shape_config(
             memory_in_gbs=SAMPLE_PAYLOAD["executor_shape_config"]["memory_in_gbs"],
             ocpus=SAMPLE_PAYLOAD["executor_shape_config"]["ocpus"],
         )

         with pytest.raises(
             ValueError,
-            match="Shape config is not required for non flex shape from user end."
+            match="Shape config is not required for non flex shape from user end.",
         ):
             with patch.object(DataFlowApp, "client", mock_client):
                 with patch.object(DataFlowApp, "to_dict", mock_to_dict):
                     df.create(rt)
-
+

 class TestDataFlowNotebookRuntime:
     @pytest.mark.skipif(
