
Commit e4eccba

Merge branch 'main' into feature/aqua_ft_mmd
2 parents 7c4f723 + 33c9966

14 files changed (+246 additions, -150 deletions)

ads/aqua/common/enums.py

Lines changed: 12 additions & 0 deletions
@@ -20,6 +20,12 @@ class Resource(ExtendedEnum):
     MODEL_VERSION_SET = "model-version-sets"


+class PredictEndpoints(ExtendedEnum):
+    CHAT_COMPLETIONS_ENDPOINT = "/v1/chat/completions"
+    TEXT_COMPLETIONS_ENDPOINT = "/v1/completions"
+    EMBEDDING_ENDPOINT = "/v1/embedding"
+
+
 class Tags(ExtendedEnum):
     TASK = "task"
     LICENSE = "license"
@@ -49,6 +55,7 @@ class InferenceContainerType(ExtendedEnum):
 class InferenceContainerTypeFamily(ExtendedEnum):
     AQUA_VLLM_CONTAINER_FAMILY = "odsc-vllm-serving"
     AQUA_VLLM_V1_CONTAINER_FAMILY = "odsc-vllm-serving-v1"
+    AQUA_VLLM_LLAMA4_CONTAINER_FAMILY = "odsc-vllm-serving-llama4"
     AQUA_TGI_CONTAINER_FAMILY = "odsc-tgi-serving"
     AQUA_LLAMA_CPP_CONTAINER_FAMILY = "odsc-llama-cpp-serving"

@@ -119,4 +126,9 @@ class Platform(ExtendedEnum):
         InferenceContainerTypeFamily.AQUA_VLLM_V1_CONTAINER_FAMILY,
         InferenceContainerTypeFamily.AQUA_VLLM_CONTAINER_FAMILY,
     ],
+    InferenceContainerTypeFamily.AQUA_VLLM_LLAMA4_CONTAINER_FAMILY: [
+        InferenceContainerTypeFamily.AQUA_VLLM_LLAMA4_CONTAINER_FAMILY,
+        InferenceContainerTypeFamily.AQUA_VLLM_V1_CONTAINER_FAMILY,
+        InferenceContainerTypeFamily.AQUA_VLLM_CONTAINER_FAMILY,
+    ],
 }
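
Reviewer note: the new entry extends the container-family compatibility map so that vLLM, vLLM v1, and Llama 4 models can be grouped under the Llama 4 serving container (this is what the relaxed error message in ads/aqua/model/model.py below refers to). A minimal sketch of how such a map can be consulted — simplified to string keys, with a hypothetical helper name; the map's actual variable name is not shown in this hunk:

    # Hypothetical, simplified stand-in for the compatibility map added above;
    # the real map is keyed by InferenceContainerTypeFamily members.
    COMPATIBLE_FAMILIES = {
        "odsc-vllm-serving-llama4": [
            "odsc-vllm-serving-llama4",
            "odsc-vllm-serving-v1",
            "odsc-vllm-serving",
        ],
    }

    def find_common_family(families):
        """Return a family whose compatibility list covers every input family, else None."""
        for candidate, compatible in COMPATIBLE_FAMILIES.items():
            if set(families) <= set(compatible):
                return candidate
        return None

    print(find_common_family(["odsc-vllm-serving", "odsc-vllm-serving-v1"]))
    # -> odsc-vllm-serving-llama4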

ads/aqua/extension/deployment_handler.py

Lines changed: 122 additions & 39 deletions
@@ -2,16 +2,18 @@
 # Copyright (c) 2024, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

-from typing import List, Union
+from typing import List, Optional, Union
 from urllib.parse import urlparse

 from tornado.web import HTTPError

+from ads.aqua.app import logger
+from ads.aqua.client.client import Client, ExtendedRequestError
 from ads.aqua.common.decorator import handle_exceptions
+from ads.aqua.common.enums import PredictEndpoints
 from ads.aqua.extension.base_handler import AquaAPIhandler
 from ads.aqua.extension.errors import Errors
-from ads.aqua.modeldeployment import AquaDeploymentApp, MDInferenceResponse
-from ads.aqua.modeldeployment.entities import ModelParams
+from ads.aqua.modeldeployment import AquaDeploymentApp
 from ads.config import COMPARTMENT_OCID


@@ -175,23 +177,107 @@ def list_shapes(self):
         )


-class AquaDeploymentInferenceHandler(AquaAPIhandler):
-    @staticmethod
-    def validate_predict_url(endpoint):
-        try:
-            url = urlparse(endpoint)
-            if url.scheme != "https":
-                return False
-            if not url.netloc:
-                return False
-            return url.path.endswith("/predict")
-        except Exception:
-            return False
+class AquaDeploymentStreamingInferenceHandler(AquaAPIhandler):
+    def _get_model_deployment_response(
+        self,
+        model_deployment_id: str,
+        payload: dict,
+        route_override_header: Optional[str],
+    ):
+        """
+        Returns the model deployment inference response in a streaming fashion.
+
+        This method connects to the specified model deployment endpoint and
+        streams the inference output back to the caller, handling both text
+        and chat completion endpoints depending on the route override.
+
+        Parameters
+        ----------
+        model_deployment_id : str
+            The OCID of the model deployment to invoke.
+            Example: 'ocid1.datasciencemodeldeployment.iad.oc1.xxxyz'
+
+        payload : dict
+            Dictionary containing the model inference parameters.
+            Sample payload for text completions:
+            {
+                "max_tokens": 1024,
+                "temperature": 0.5,
+                "prompt": "what are some good skills deep learning expert. Give us some tips on how to structure interview with some coding example?",
+                "top_p": 0.4,
+                "top_k": 100,
+                "model": "odsc-llm",
+                "frequency_penalty": 1,
+                "presence_penalty": 1,
+                "stream": true
+            }
+
+        route_override_header : Optional[str]
+            Optional override for the inference route, used for routing between
+            different endpoint types (e.g., chat vs. text completions).
+            Example: '/v1/chat/completions'
+
+        Returns
+        -------
+        Generator[str]
+            A generator that yields strings of the model's output as they are received.
+
+        Raises
+        ------
+        HTTPError
+            If the request to the model deployment fails or if streaming cannot be established.
+        """
+
+        model_deployment = AquaDeploymentApp().get(model_deployment_id)
+        endpoint = model_deployment.endpoint + "/predictWithResponseStream"
+        endpoint_type = model_deployment.environment_variables.get(
+            "MODEL_DEPLOY_PREDICT_ENDPOINT", PredictEndpoints.TEXT_COMPLETIONS_ENDPOINT
+        )
+        aqua_client = Client(endpoint=endpoint)
+
+        if PredictEndpoints.CHAT_COMPLETIONS_ENDPOINT in (
+            endpoint_type,
+            route_override_header,
+        ):
+            try:
+                for chunk in aqua_client.chat(
+                    messages=payload.pop("messages"),
+                    payload=payload,
+                    stream=True,
+                ):
+                    try:
+                        yield chunk["choices"][0]["delta"]["content"]
+                    except Exception as e:
+                        logger.debug(
+                            f"Exception occurred while parsing streaming response: {e}"
+                        )
+            except ExtendedRequestError as ex:
+                raise HTTPError(400, str(ex))
+            except Exception as ex:
+                raise HTTPError(500, str(ex))
+
+        elif endpoint_type == PredictEndpoints.TEXT_COMPLETIONS_ENDPOINT:
+            try:
+                for chunk in aqua_client.generate(
+                    prompt=payload.pop("prompt"),
+                    payload=payload,
+                    stream=True,
+                ):
+                    try:
+                        yield chunk["choices"][0]["text"]
+                    except Exception as e:
+                        logger.debug(
+                            f"Exception occurred while parsing streaming response: {e}"
+                        )
+            except ExtendedRequestError as ex:
+                raise HTTPError(400, str(ex))
+            except Exception as ex:
+                raise HTTPError(500, str(ex))

     @handle_exceptions
-    def post(self, *args, **kwargs): # noqa: ARG002
+    def post(self, model_deployment_id):
         """
-        Handles inference request for the Active Model Deployments
+        Handles streaming inference request for the Active Model Deployments
         Raises
         ------
         HTTPError
@@ -205,32 +291,29 @@ def post(self, *args, **kwargs): # noqa: ARG002
         if not input_data:
             raise HTTPError(400, Errors.NO_INPUT_DATA)

-        endpoint = input_data.get("endpoint")
-        if not endpoint:
-            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("endpoint"))
-
-        if not self.validate_predict_url(endpoint):
-            raise HTTPError(400, Errors.INVALID_INPUT_DATA_FORMAT.format("endpoint"))
-
         prompt = input_data.get("prompt")
-        if not prompt:
-            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("prompt"))
+        messages = input_data.get("messages")

-        model_params = (
-            input_data.get("model_params") if input_data.get("model_params") else {}
-        )
-        try:
-            model_params_obj = ModelParams(**model_params)
-        except Exception as ex:
+        if not prompt and not messages:
             raise HTTPError(
-                400, Errors.INVALID_INPUT_DATA_FORMAT.format("model_params")
-            ) from ex
-
-        return self.finish(
-            MDInferenceResponse(prompt, model_params_obj).get_model_deployment_response(
-                endpoint
+                400, Errors.MISSING_REQUIRED_PARAMETER.format("prompt/messages")
             )
+        if not input_data.get("model"):
+            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("model"))
+        route_override_header = self.request.headers.get("route", None)
+        self.set_header("Content-Type", "text/event-stream")
+        response_gen = self._get_model_deployment_response(
+            model_deployment_id, input_data, route_override_header
+        )
+        try:
+            for chunk in response_gen:
+                self.write(chunk)
+                self.flush()
+            self.finish()
+        except Exception as ex:
+            self.set_status(ex.status_code)
+            self.write({"message": "Error occurred", "reason": str(ex)})
+            self.finish()


 class AquaDeploymentParamsHandler(AquaAPIhandler):
@@ -294,5 +377,5 @@ def post(self, *args, **kwargs): # noqa: ARG002
     ("deployments/?([^/]*)", AquaDeploymentHandler),
     ("deployments/?([^/]*)/activate", AquaDeploymentHandler),
     ("deployments/?([^/]*)/deactivate", AquaDeploymentHandler),
-    ("inference", AquaDeploymentInferenceHandler),
+    ("inference/stream/?([^/]*)", AquaDeploymentStreamingInferenceHandler),
 ]
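
Reviewer note: with the route registered above, a client can stream tokens from an active deployment by POSTing to inference/stream/<deployment OCID>. A minimal sketch using requests — the base URL and any auth are environment-specific assumptions, and the OCID is the docstring's placeholder example:

    import requests

    # Assumption: base URL where the AQUA extension is mounted.
    BASE_URL = "http://localhost:8888/aqua"
    MD_OCID = "ocid1.datasciencemodeldeployment.iad.oc1.xxxyz"

    payload = {
        "model": "odsc-llm",   # required by the handler
        "prompt": "Hello!",    # or "messages": [...] for chat models
        "max_tokens": 256,
        "stream": True,
    }

    # Optional: a "route" header overrides endpoint-type detection, e.g.
    # {"route": "/v1/chat/completions"} to force the chat path for a
    # "messages" payload.
    with requests.post(
        f"{BASE_URL}/inference/stream/{MD_OCID}",
        json=payload,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        # The handler flushes each text chunk as it arrives.
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)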

ads/aqua/model/model.py

Lines changed: 2 additions & 1 deletion
@@ -390,7 +390,8 @@ def create_multi(
             raise AquaValueError(
                 "The selected models are associated with different container families: "
                 f"{list(selected_models_deployment_containers)}."
-                "For multi-model deployment, all models in the group must share the same container family."
+                "For multi-model deployment, all models in the group must belong to the same container "
+                "family or to compatible container families."
             )
         else:
             deployment_container = selected_models_deployment_containers.pop()

ads/aqua/modeldeployment/__init__.py

Lines changed: 2 additions & 4 deletions
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (c) 2024 Oracle and/or its affiliates.
+# Copyright (c) 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 from ads.aqua.modeldeployment.deployment import AquaDeploymentApp
-from ads.aqua.modeldeployment.inference import MDInferenceResponse

-__all__ = ["AquaDeploymentApp", "MDInferenceResponse"]
+__all__ = ["AquaDeploymentApp"]
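
Reviewer note: MDInferenceResponse is no longer exported, and its backing module ads/aqua/modeldeployment/inference.py is deleted later in this commit. A hedged before/after sketch of the migration, with the old call pattern taken from the removed handler code above:

    # Before this commit (removed):
    #   from ads.aqua.modeldeployment import MDInferenceResponse
    #   MDInferenceResponse(prompt, model_params_obj).get_model_deployment_response(endpoint)
    #
    # After: POST to the extension route "inference/stream/<deployment OCID>",
    # served by AquaDeploymentStreamingInferenceHandler (see the client sketch
    # under ads/aqua/extension/deployment_handler.py above).
    from ads.aqua.modeldeployment import AquaDeploymentApp  # still exported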

ads/aqua/modeldeployment/deployment.py

Lines changed: 5 additions & 4 deletions
@@ -17,7 +17,11 @@
     ComputeShapeSummary,
     ContainerPath,
 )
-from ads.aqua.common.enums import InferenceContainerTypeFamily, ModelFormat, Tags
+from ads.aqua.common.enums import (
+    InferenceContainerTypeFamily,
+    ModelFormat,
+    Tags,
+)
 from ads.aqua.common.errors import AquaRuntimeError, AquaValueError
 from ads.aqua.common.utils import (
     DEFINED_METADATA_TO_FILE_MAP,
@@ -871,7 +875,6 @@ def get(self, model_deployment_id: str, **kwargs) -> "AquaDeploymentDetail":
         model_deployment = self.ds_client.get_model_deployment(
             model_deployment_id=model_deployment_id, **kwargs
         ).data
-
         oci_aqua = (
             (
                 Tags.AQUA_TAG in model_deployment.freeform_tags
@@ -916,7 +919,6 @@ def get(self, model_deployment_id: str, **kwargs) -> "AquaDeploymentDetail":
         aqua_deployment = AquaDeployment.from_oci_model_deployment(
             model_deployment, self.region
         )
-
         if Tags.MULTIMODEL_TYPE_TAG in model_deployment.freeform_tags:
             aqua_model_id = model_deployment.freeform_tags.get(
                 Tags.AQUA_MODEL_ID_TAG, UNKNOWN
@@ -947,7 +949,6 @@ def get(self, model_deployment_id: str, **kwargs) -> "AquaDeploymentDetail":
         aqua_deployment.models = [
             AquaMultiModelRef(**metadata) for metadata in multi_model_metadata
         ]
-
         return AquaDeploymentDetail(
             **vars(aqua_deployment),
             log_group=AquaResourceIdentifier(

ads/aqua/modeldeployment/inference.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

ads/common/oci_client.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

 import logging

docs/source/release_notes.rst

Lines changed: 15 additions & 0 deletions
@@ -2,6 +2,21 @@
 Release Notes
 =============

+2.13.9
+-------
+Release date: May 19, 2025
+
+* Additional precision support in ForecastOperator.
+* AI Quick Actions: Use defined-metadata to include configuration for fine-tuned models.
+* AI Quick Actions: Support for embedding models in a multi-model deployment.
+* AI Quick Actions: Fixed a bug in multi-model deployment to use the model artifact JSON directly, instead of accessing the service bucket, when creating a new grouped model.
+* AI Quick Actions: Telemetry improvements, including use of a thread pool instead of an unbounded number of threads.
+* AI Quick Actions: Support for the ``list`` API for compute capacity reservations to onboard Bring-Your-Own-Reservation (BYOR).
+* AI Quick Actions: Fixed a bug that prevented specifying multiple deployment parameters.
+* AI Quick Actions: Enhanced the model deployment logic for the vLLM architecture version.
+* AI Quick Actions: Enhanced retrieval of the deployment configuration for fine-tuned models.
+
+
 2.13.8
 -------
 Release date: April 15, 2025
