[AQUA] Adding ADS support for embedding models in Multi Model Deployment #1163


Merged Apr 25, 2025 · 13 commits · Changes shown from 4 commits
4 changes: 4 additions & 0 deletions ads/aqua/common/entities.py
@@ -151,6 +151,9 @@ class AquaMultiModelRef(Serializable):
The name of the model.
gpu_count : Optional[int]
Number of GPUs required for deployment.
model_task : Optional[str]
The task that the model performs.
If specified, overrides the default completion/chat inference endpoints with the embedding endpoint.
env_var : Optional[Dict[str, Any]]
Optional environment variables to override during deployment.
artifact_location : Optional[str]
@@ -162,6 +165,7 @@ class AquaMultiModelRef(Serializable):
gpu_count: Optional[int] = Field(
None, description="The gpu count allocation for the model."
)
model_task: Optional[str] = Field(
None, description="The task that the model performs."
)
env_var: Optional[dict] = Field(
default_factory=dict, description="The environment variables of the model."
)
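For context, a minimal usage sketch of the new field (not part of this diff; the model IDs, names, and artifact locations below are illustrative placeholders). An entry that sets model_task="embedding" is routed to the embedding endpoint, while entries that leave it unset keep the default completion/chat endpoints:

from ads.aqua.common.entities import AquaMultiModelRef

# Illustrative only: IDs, names, and artifact locations are placeholders.
embedding_model = AquaMultiModelRef(
    model_id="ocid1.datasciencemodel.oc1..<OCID>",
    model_name="my_embedding_model",
    model_task="embedding",  # overrides the default completion/chat endpoint
    gpu_count=1,
    artifact_location="oci://bucket@namespace/models/embedding/",
)

completion_model = AquaMultiModelRef(
    model_id="ocid1.datasciencemodel.oc1..<OCID>",
    model_name="my_completion_model",  # model_task omitted: default endpoints apply
    gpu_count=1,
    artifact_location="oci://bucket@namespace/models/completion/",
)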
5 changes: 5 additions & 0 deletions ads/aqua/model/enums.py
@@ -28,3 +28,8 @@ class FineTuningCustomMetadata(ExtendedEnum):
class MultiModelSupportedTaskType(ExtendedEnum):
TEXT_GENERATION = "text-generation"
TEXT_GENERATION_ALT = "text_generation"
EMBEDDING_ALT = "text_embedding"
Member: Shouldn't we add embedding as well?

Member Author: Fixed; we add embedding at the SMC level.


class MultiModelConfigMode(ExtendedEnum):
EMBEDDING = "embedding"
DEFAULT = "completion"
16 changes: 15 additions & 1 deletion ads/aqua/model/model.py
@@ -80,7 +80,7 @@
ImportModelDetails,
ModelValidationResult,
)
from ads.aqua.model.enums import MultiModelSupportedTaskType
from ads.aqua.model.enums import MultiModelConfigMode, MultiModelSupportedTaskType
from ads.common.auth import default_signer
from ads.common.oci_resource import SEARCH_TYPE, OCIResource
from ads.common.utils import (
@@ -316,6 +316,11 @@ def create_multi(

display_name_list.append(display_name)

model_task = source_model.freeform_tags.get(Tags.TASK, UNKNOWN)
Member: I would rather move this logic to the _get_task() method:

model.model_task = self._get_task(model, source_model)

def _get_task(model_ref: AquaMultiModelRef, source_model: DataScienceModel) -> str:
    # Extract the task from model_ref itself; if it is not present there,
    # fall back to the source model's freeform tags.
    task = model_ref.model_task or source_model.freeform_tags.get(Tags.TASK, UNKNOWN)
    return task

I believe we should also allow users to pass the task within AquaMultiModelRef, in case the tags were not populated properly.

Member Author: We allow the user to pass the task; if not provided, we use the freeform tags of the source model.


if model_task != UNKNOWN:
self._get_task(model, model_task)

# Retrieve model artifact
model_artifact_path = source_model.artifact
if not model_artifact_path:
@@ -704,6 +709,15 @@ def edit_registered_model(
else:
raise AquaRuntimeError("Only registered unverified models can be edited.")

def _get_task(
Member: Looks like this method doesn't return any value, yet its signature indicates a return type of str. Should we update the type hint to reflect that it returns None, or adjust the implementation to return a string as specified?

self,
model: AquaMultiModelRef,
freeform_task_tag: str
) -> str:
"""In a Multi Model Deployment, will set model task if freeform task tag from model needs a non-completion endpoint (embedding)"""
if freeform_task_tag == MultiModelSupportedTaskType.EMBEDDING_ALT:
model.model_task = MultiModelConfigMode.EMBEDDING

def _fetch_metric_from_metadata(
self,
custom_metadata_list: ModelCustomMetadata,
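Taken together with the review thread, the intended resolution order is: prefer a task passed explicitly on AquaMultiModelRef, otherwise fall back to the source model's freeform TASK tag, and switch the per-model mode to embedding when that tag is text_embedding. A hedged sketch of that resolution logic follows; it is a standalone illustrative helper, not the _get_task method added in this PR (which mutates the model in place and currently returns nothing, as noted above):

from typing import Optional

from ads.aqua.model.enums import MultiModelConfigMode, MultiModelSupportedTaskType

def resolve_model_task(
    user_task: Optional[str], freeform_task_tag: Optional[str]
) -> Optional[str]:
    """Illustrative only: pick the user-supplied task if present; otherwise map an
    embedding-style freeform tag to the embedding config mode."""
    if user_task:
        return user_task  # e.g. "embedding", passed directly on AquaMultiModelRef
    if freeform_task_tag == MultiModelSupportedTaskType.EMBEDDING_ALT:  # "text_embedding"
        return MultiModelConfigMode.EMBEDDING  # "embedding"
    return None  # default completion/chat endpoints apply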
10 changes: 6 additions & 4 deletions ads/aqua/modeldeployment/deployment.py
@@ -178,9 +178,7 @@ def create(
# validate instance shape availability in compartment
available_shapes = [
shape.name.lower()
for shape in self.list_shapes(
compartment_id=compartment_id
)
for shape in self.list_shapes(compartment_id=compartment_id)
]

if create_deployment_details.instance_shape.lower() not in available_shapes:
@@ -645,7 +643,11 @@ def _create_multi(
os_path = ObjectStorageDetails.from_path(artifact_path_prefix)
artifact_path_prefix = os_path.filepath.rstrip("/")

model_config.append({"params": params, "model_path": artifact_path_prefix})
# override the default completion/chat endpoint with another endpoint (e.g., embedding)
config_data = {"params": params, "model_path": artifact_path_prefix}
if model.model_task:
config_data["model_task"] = model.model_task
model_config.append(config_data)
model_name_list.append(model.model_name)

env_var.update({AQUA_MULTI_MODEL_CONFIG: json.dumps({"models": model_config})})
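The net effect of this change is that MULTI_MODEL_CONFIG gains an optional per-model model_task key. A sketch of the resulting environment variable for a mixed deployment (values are illustrative and mirror the unit-test fixtures below):

import json

multi_model_config = {
    "models": [
        {
            "params": "--served-model-name model_one --tensor-parallel-size 1 --max-model-len 2096",
            "model_path": "models/model_one/5be6479/artifact/",
            "model_task": "embedding",  # only emitted when model.model_task is set
        },
        {
            "params": "--served-model-name model_two --tensor-parallel-size 1 --max-model-len 2096",
            "model_path": "models/model_two/83e9aa1/artifact/",
            # no "model_task": the default completion/chat endpoint is used
        },
    ]
}
env_var = {"MULTI_MODEL_CONFIG": json.dumps(multi_model_config)}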
14 changes: 12 additions & 2 deletions tests/unitary/with_extras/aqua/test_deployment.py
@@ -276,7 +276,7 @@ class TestDataset:
"environment_configuration_type": "OCIR_CONTAINER",
"environment_variables": {
"MODEL_DEPLOY_PREDICT_ENDPOINT": "/v1/completions",
"MULTI_MODEL_CONFIG": '{ "models": [{ "params": "--served-model-name model_one --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_one/5be6479/artifact/"}, {"params": "--served-model-name model_two --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_two/83e9aa1/artifact/"}, {"params": "--served-model-name model_three --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_three/83e9aa1/artifact/"}]}',
"MULTI_MODEL_CONFIG": '{ "models": [{ "params": "--served-model-name model_one --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_one/5be6479/artifact/", "model_task": "embedding"}, {"params": "--served-model-name model_two --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_two/83e9aa1/artifact/"}, {"params": "--served-model-name model_three --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_three/83e9aa1/artifact/"}]}',
},
"health_check_port": 8080,
"image": "dsmc://image-name:1.0.0.0",
@@ -486,27 +486,30 @@ class TestDataset:
"gpu_count": 2,
"model_id": "test_model_id_1",
"model_name": "test_model_1",
"model_task": "embedding",
"artifact_location": "test_location_1",
},
{
"env_var": {},
"gpu_count": 2,
"model_id": "test_model_id_2",
"model_name": "test_model_2",
"model_task": None,
"artifact_location": "test_location_2",
},
{
"env_var": {},
"gpu_count": 2,
"model_id": "test_model_id_3",
"model_name": "test_model_3",
"model_task": None,
"artifact_location": "test_location_3",
},
],
"model_id": "ocid1.datasciencemodel.oc1.<region>.<OCID>",
"environment_variables": {
"MODEL_DEPLOY_PREDICT_ENDPOINT": "/v1/completions",
"MULTI_MODEL_CONFIG": '{ "models": [{ "params": "--served-model-name model_one --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_one/5be6479/artifact/"}, {"params": "--served-model-name model_two --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_two/83e9aa1/artifact/"}, {"params": "--served-model-name model_three --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_three/83e9aa1/artifact/"}]}',
"MULTI_MODEL_CONFIG": '{ "models": [{ "params": "--served-model-name model_one --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_one/5be6479/artifact/", "model_task": "embedding"}, {"params": "--served-model-name model_two --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_two/83e9aa1/artifact/"}, {"params": "--served-model-name model_three --tensor-parallel-size 1 --max-model-len 2096", "model_path": "models/model_three/83e9aa1/artifact/"}]}',
},
"cmd": [],
"console_link": "https://cloud.oracle.com/data-science/model-deployments/ocid1.datasciencemodeldeployment.oc1.<region>.<MD_OCID>?region=region-name",
@@ -965,20 +968,23 @@ class TestDataset:
"gpu_count": 1,
"model_id": "ocid1.compartment.oc1..<OCID>",
"model_name": "model_one",
"model_task": "embedding",
"artifact_location": "artifact_location_one",
},
{
"env_var": {"--test_key_two": "test_value_two"},
"gpu_count": 1,
"model_id": "ocid1.compartment.oc1..<OCID>",
"model_name": "model_two",
"model_task": None,
"artifact_location": "artifact_location_two",
},
{
"env_var": {"--test_key_three": "test_value_three"},
"gpu_count": 1,
"model_id": "ocid1.compartment.oc1..<OCID>",
"model_name": "model_three",
"model_task": None,
"artifact_location": "artifact_location_three",
},
]
@@ -1787,20 +1793,23 @@ def test_create_deployment_for_multi_model(
model_info_1 = AquaMultiModelRef(
model_id="test_model_id_1",
model_name="test_model_1",
model_task="embedding",
gpu_count=2,
artifact_location="test_location_1",
)

model_info_2 = AquaMultiModelRef(
model_id="test_model_id_2",
model_name="test_model_2",
model_task=None,
gpu_count=2,
artifact_location="test_location_2",
)

model_info_3 = AquaMultiModelRef(
model_id="test_model_id_3",
model_name="test_model_3",
model_task=None,
gpu_count=2,
artifact_location="test_location_3",
)
@@ -1826,6 +1835,7 @@ def test_create_deployment_for_multi_model(

expected_attributes = set(AquaDeployment.__annotations__.keys())
actual_attributes = result.to_dict()

assert set(actual_attributes) == set(expected_attributes), "Attributes mismatch"
expected_result = copy.deepcopy(TestDataset.aqua_multi_deployment_object)
expected_result["state"] = "CREATING"