diff --git a/sdk/python/foundation-models/system/reinforcement-learning/reinforcement-learning.ipynb b/sdk/python/foundation-models/system/reinforcement-learning/reinforcement-learning.ipynb
new file mode 100644
index 000000000..8eac40859
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/reinforcement-learning.ipynb
@@ -0,0 +1,763 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div style=\"background: linear-gradient(135deg, #0078d4 0%, #106ebe 50%, #005a9e 100%); color: white; padding: 30px; border-radius: 12px; margin: 20px 0; box-shadow: 0 4px 15px rgba(0, 120, 212, 0.3);\">\n",
+    "    <h1 style=\"margin: 0; text-align: center; font-size: 2.2em; font-weight: 600; letter-spacing: 0.5px; font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, Roboto, 'Helvetica Neue', sans-serif;\">\n",
+    "        Ignite Demo to Train, Customize, Optimize and Host Reasoning Models in AzureML\n",
+    "    </h1>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\"> Sections Breakdown </h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<ol style=\"color: #2c3e50; line-height: 1.8;\">\n",
+    "<li>🔧 <b>Setup Workspace:</b> Configure Azure ML workspace and authenticate</li>\n",
+    "<li>🧠 <b>RFT Training (GRPO):</b> Fine-tune reasoning model using Group Relative Policy Optimization</li>\n",
+    "<li>⚡ <b>RFT Training (Reinforce++):</b> Fine-tune using critic-free reinforcement learning</li>\n",
+    "<li>📦 <b>Create Data Assets:</b> Convert pipeline outputs to reusable data assets</li>\n",
+    "<li>📊 <b>Model Performance Comparison:</b> Evaluate and compare base model vs GRPO vs Reinforce++</li>\n",
+    "<li>🎯 <b>Create Draft Model:</b> Train EAGLE3 draft model for speculative decoding</li>\n",
+    "<li>🔗 <b>Combine Draft and Base Model:</b> Package base and draft models for deployment</li>\n",
+    "<li>🚀 <b>Deploy Speculative Endpoint:</b> Deploy managed online endpoint with speculative decoding</li>\n",
+    "<li>📡 <b>Deploy Base Endpoint:</b> Deploy baseline endpoint for performance comparison</li>\n",
+    "<li>🧪 <b>Test Base and Speculative Decoding Endpoints:</b> Validate both endpoints with inference requests</li>\n",
+    "<li>📈 <b>Endpoints Performance Evaluation:</b> Compare metrics between base and speculative decoding endpoints</li>\n",
+    "</ol>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">Prerequisites & Requirements</h3>\n",
+    "</div>\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Compute Requirements\n",
+    "* **Training:** Standard_ND96isr_H100_v5, Standard_ND96amsr_A100_v4\n",
+    "* **Deployment:** Kubernetes cluster with GPU instances (octagpu)\n",
+    "##### Dataset & Models\n",
+    "* **Dataset:** [FinQA](https://finqasite.github.io/) - 2.8k financial reports with 8k Q&A pairs\n",
+    "* **Models:** [Llama-3.1-8B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8), [DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div style=\"background: #e7f3ff; border: 1px solid #b3d9ff; padding: 15px; border-radius: 5px; margin: 20px 0;\">\n",
+    "    <p style=\"margin: 0; color: #0066cc;\">\n",
+    "        <strong>💡 Note:</strong> Ensure your Azure ML workspace has access to the required compute resources and GPU instances before proceeding with the training and deployment steps.\n",
+    "    </p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div style=\"background: linear-gradient(135deg, #0078d4 0%, #106ebe 50%, #005a9e 100%); color: white; padding: 30px; border-radius: 12px; margin: 20px 0; box-shadow: 0 4px 15px rgba(0, 120, 212, 0.3);\">\n",
+    "    <h1 style=\"margin: 0; text-align: center; font-size: 2.2em; font-weight: 600; letter-spacing: 0.5px; font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, Roboto, 'Helvetica Neue', sans-serif;\">\n",
+    "        RFT Finetuning - GRPO & Reinforce Plus Plus\n",
+    "    </h1>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">⚙️ Section 1. Setup Workspace and Register Components</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "<p>This section establishes connectivity to your workspace and sets up the required authentication.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from scripts.utils import setup_workspace\n",
+    "from scripts.dataset import prepare_finqa_dataset\n",
+    "from scripts.run import get_run_metrics\n",
+    "from scripts.reinforcement_learning import run_rl_training_pipeline\n",
+    "from scripts.evaluation import run_evaluation_pipeline\n",
+    "from scripts.speculative_decoding import (\n",
+    "    run_draft_model_pipeline,\n",
+    "    prepare_combined_model_for_deployment,\n",
+    "    deploy_speculative_decoding_endpoint,\n",
+    ")\n",
+    "from scripts.deployment import create_managed_deployment, test_deployment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setup Azure ML workspace and registry connections\n",
+    "ml_client, registry_ml_client = setup_workspace(\n",
+    "    config_path=\"./config.json\", registry_name=\"Ignite_2025_Demo\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>Prepare the dataset for fine-tuning. This saves the train, test, and validation splits under the <code>data</code> folder.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data_path, test_data_path, valid_data_path = prepare_finqa_dataset(\n",
+    "    ml_client, data_dir=\"data\", register_datasets=False\n",
+    ")  # Prepare the FinQA dataset for training and evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "##### 📖 Components and Pipelines used in this notebook can be installed locally by following the instructions listed here : [Ignite Components and Pipelines](Ignite_Components_And_Pipelines/README.md)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🧩 Section 2. Run RFT Training Pipeline (GRPO)</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>GRPO (Group Relative Policy Optimization) is an advanced reinforcement learning technique for fine-tuning LLMs that uses relative learning instead of absolute rewards by comparing model outputs within groups/batches.</p>\n",
+    "<ul><li>This approach processes multiple responses simultaneously to learn relative preferences through direct policy optimization using reinforcement learning signals and preference learning from human feedback or reward models.</li>\n",
+    "<li>Common use cases include instruction following improvement, mathematical reasoning enhancement, code generation optimization, and general conversational AI alignment.</li>\n",
+    "<li>In this notebook, we use GRPO to fine-tune an LLM on financial reasoning tasks, improving the model's ability to solve complex financial questions with step-by-step reasoning.</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p>\n",
+    "The RFT run outputs multiple model checkpoints based on the value of <b>trainer_save_freq</b>, which is defined in the config.\n",
+    "</p>\n",
+    "<p><i>For example, if this value is 20, a model checkpoint is stored at every 20th optimization step of the trainer.\n",
+    "Each model checkpoint is a fully deployable copy of the model's weights fine-tuned up to that point.</i>\n",
+    "</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run complete RL training pipeline: train model, register model\n",
+    "grpo_job, status, grpo_registered_model = run_rl_training_pipeline(\n",
+    "    ml_client=ml_client,\n",
+    "    registry_ml_client=registry_ml_client,\n",
+    "    base_model_id=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",  # Hugging Face ID of the model to be RFT fine-tuned.\n",
+    "    compute_cluster=\"k8s-a100-compute\",  # Name of the Kubernetes Cluster in Workspace\n",
+    "    rl_method=\"grpo\",  # RL methodology to be selected for training run.\n",
+    "    train_data_path=train_data_path,  # Path to training dataset\n",
+    "    valid_data_path=valid_data_path,  # Path to validation dataset\n",
+    "    config={\n",
+    "        \"num_nodes_finetune\": 1,  # Training specific arguments which can be overridden by user.\n",
+    "        \"trainer_total_epochs\": 1,\n",
+    "        \"trainer_save_freq\": 20,\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🧩Section 3. Run RFT Training Pipeline ( Reinforce++ )</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "<p>Reinforce++ is a critic-free reinforcement learning framework that addresses key limitations of traditional RLHF algorithms like PPO by introducing Global Advantage Normalization instead of prompt-level normalization.</p>\n",
+    "<ul><li>This method eliminates the computational and memory overhead of critic networks while providing more stable and theoretically sound advantage estimation by normalizing across entire global batches rather than small prompt-specific groups.</li>\n",
+    "<li>Reinforce++ offers significant advantages including removal of critic network overhead, theoretically unbiased estimation (bias vanishes as batch size increases), superior stability compared to local normalization methods like GRPO/RLOO, and better resistance to overfitting in RLHF scenarios.</li>\n",
+    "<li>In this notebook, we use Reinforce++ to fine-tune an LLM on financial reasoning tasks, leveraging its global advantage normalization to achieve more stable policy updates and superior performance in complex agentic reasoning scenarios.</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p>\n",
+    "The RFT run outputs multiple model checkpoints based on the value of <b>trainer_save_freq</b>, which is defined in the config.\n",
+    "</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run complete RL training pipeline: verify datasets, register data, train model, register model\n",
+    "rlpp_job, status, rlpp_registered_model = run_rl_training_pipeline(\n",
+    "    ml_client=ml_client,\n",
+    "    registry_ml_client=registry_ml_client,\n",
+    "    base_model_id=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",  # Hugging Face ID of the model to be RFT fine-tuned.\n",
+    "    compute_cluster=\"k8s-a100-compute\",  # Name of the Kubernetes Cluster in workspace.\n",
+    "    rl_method=\"reinforce_plus_plus\",  # RL methodology to be selected for training run.\n",
+    "    train_data_path=train_data_path,  # Path to training dataset\n",
+    "    valid_data_path=valid_data_path,  # Path to validation dataset\n",
+    "    config={\n",
+    "        \"num_nodes_finetune\": 1,\n",
+    "        \"trainer_total_epochs\": 1,  # Training specific arguments which can be overridden by user.\n",
+    "        \"trainer_save_freq\": 20,\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">📊Section 4. Compare Model Performance across Base Model vs GRPO vs Reinforce++ </h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "\n",
+    "<p>This section evaluates and compares the performance of different finetuned models and base model across key metrics:</p>\n",
+    "\n",
+    "\n",
+    "\n",
+    "<p><strong>Evaluation Process:</strong></p>\n",
+    "<ul>\n",
+    "<li>Tests multiple checkpoints from each training method</li>\n",
+    "<li>Evaluates on FinQA validation dataset for financial reasoning accuracy</li>\n",
+    "<li>Provides comprehensive metrics to determine the best performing model</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p><em>💡 The evaluation will help identify which RL method produces the most effective model for financial reasoning tasks.</em></p>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>We will now submit the evaluation job with the GRPO and Reinforce++ (RLPP) model outputs.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Function which invokes the model evaluation pipeline.\n",
+    "eval_job, status = run_evaluation_pipeline(\n",
+    "    ml_client=ml_client,\n",
+    "    registry_ml_client=registry_ml_client,\n",
+    "    compute_cluster=\"k8s-a100-compute\",\n",
+    "    grpo_model_dir=grpo_registered_model.path,  # Output from GRPO RL provided as data asset created from earlier step.\n",
+    "    rlpp_model_dir=rlpp_registered_model.path,  # Output from Reinforce_plus_plus RL provided as data asset created from earlier step.\n",
+    "    validation_dataset_path=test_data_path,  # Path to test dataset\n",
+    "    run_config={\n",
+    "        \"num_nodes\": 1,  # Number of nodes to be used for evaluation run.\n",
+    "        \"number_of_gpu_to_use\": 8,  # Number of GPUs in a node to be used for evaluation run.\n",
+    "        \"base_path_1_label\": \"GRPO\",  # Label to identify GRPO model outputs.\n",
+    "        \"base_path_2_label\": \"RLPP\",  # Label to identify RLPP model outputs.\n",
+    "        \"explore_pattern_1\": \"global_step_{checkpoint}/actor/lora_adapter/\",\n",
+    "        \"explore_pattern_2\": \"global_step_{checkpoint}/actor/lora_adapter/\",\n",
+    "        \"checkpoint_values_1\": \"12\",\n",
+    "        \"checkpoint_values_2\": \"12\",\n",
+    "        \"use_lora_adapters_1\": True,\n",
+    "        \"use_lora_adapters_2\": True,\n",
+    "        \"evaluate_base_model\": True,  # Set to True to evaluate base model along with RL finetuned models.\n",
+    "        \"hf_model_id\": \"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",  # Huggingface ID of the base model\n",
+    "        \"max_prompt_length\": 8196,\n",
+    "        \"max_response_length\": 1024,\n",
+    "        \"dtype\": \"bfloat16\",\n",
+    "        \"tensor_parallel_size\": 4,\n",
+    "    },  # Configuration parameters for evaluation run.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>Now, let's fetch the metrics from the evaluation run in order to show the comparison.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_metrics = get_run_metrics(eval_job)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE_metrics = {k: v for k, v in eval_metrics.items() if \"base_model\" in k}\n",
+    "GRPO_metrics = {k: v for k, v in eval_metrics.items() if \"GRPO\" in k}\n",
+    "RLPP_metrics = {k: v for k, v in eval_metrics.items() if \"RLPP\" in k}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_base_accuracy = (\n",
+    "    min([v for k, v in BASE_metrics.items() if \"min\" in k]) if BASE_metrics else 0\n",
+    ")\n",
+    "max_grpo_accuracy = (\n",
+    "    max([v for k, v in GRPO_metrics.items() if \"max\" in k]) if GRPO_metrics else 0\n",
+    ")\n",
+    "max_rlpp_accuracy = (\n",
+    "    max([v for k, v in RLPP_metrics.items() if \"max\" in k]) if RLPP_metrics else 0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>GRPO vs Reinforce++ vs Base Model Performance Comparison</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "categories = [\"Baseline Model\", \"GRPO Model\", \"RL++ Model\"]\n",
+    "values = [min_base_accuracy, max_grpo_accuracy, max_rlpp_accuracy]\n",
+    "\n",
+    "plt.bar(categories, values, color=[\"blue\", \"orange\", \"green\"])\n",
+    "\n",
+    "# Add labels and title\n",
+    "plt.xlabel(\"Model Type\", fontsize=12, labelpad=10, color=\"#BC1B1B\")\n",
+    "plt.ylabel(\"Accuracy\", fontsize=12, labelpad=10, color=\"#BC1B1B\")\n",
+    "plt.title(\n",
+    "    \"Graph Comparing Baseline, GRPO, and RL++ Model Accuracies\", pad=10, color=\"#BC1B1B\"\n",
+    ")\n",
+    "\n",
+    "# Show plot\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>The evaluation results demonstrate that both GRPO and Reinforce++ fine-tuning methods significantly improve financial reasoning performance compared to the base model. \n",
+    "These accuracy metrics help identify the optimal checkpoint for deployment in the speculative decoding pipeline.</p>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div style=\"background: linear-gradient(135deg, #0078d4 0%, #106ebe 50%, #005a9e 100%); color: white; padding: 30px; border-radius: 12px; margin: 20px 0; box-shadow: 0 4px 15px rgba(0, 120, 212, 0.3);\">\n",
+    "    <h1 style=\"margin: 0; text-align: center; font-size: 2.2em; font-weight: 600; letter-spacing: 0.5px; font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, Roboto, 'Helvetica Neue', sans-serif;\">\n",
+    "        Speculative Decoding\n",
+    "    </h1>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### The following sections cover creating the draft model, combining the base and draft models, deploying the speculative decoding model, and benchmarking the endpoints."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🧩Section 5. Create Draft Model for Speculative Decoding</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "<p>EAGLE3 (the third generation of EAGLE, the Extrapolation Algorithm for Greater Language-model Efficiency) is the latest advancement in speculative decoding that provides significant performance improvements:</p>\n",
+    "\n",
+    "<ul>\n",
+    "<li><strong>Direct Token Prediction with Multi-layer Fusion:</strong> Abandons feature prediction for direct token prediction using advanced multi-layer feature fusion, enabling more accurate speculation and full benefit from scaled training data</li>\n",
+    "<li><strong>Superior Performance:</strong> Achieves speedup ratios up to 6.5x (1.4x improvement over EAGLE-2) while maintaining identical output quality through advanced speculative decoding techniques</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p>This pipeline creates a specialized draft model that works alongside the base model to enable dramatically improved inference performance for reasoning tasks. The EAGLE3 approach is particularly effective for complex financial reasoning scenarios where maintaining accuracy while achieving significant speed improvements is crucial.</p>\n",
+    "\n",
+    "<p><strong>Reference:</strong> <a href=\"https://arxiv.org/abs/2503.01840\">https://arxiv.org/abs/2503.01840</a></p>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train EAGLE3 draft model for speculative decoding\n",
+    "draft_job, draft_status = run_draft_model_pipeline(\n",
+    "    ml_client=ml_client,\n",
+    "    registry_ml_client=registry_ml_client,\n",
+    "    compute_cluster=\"k8s-a100-compute\",  # Name of the Kubernetes Cluster in Workspace.\n",
+    "    num_epochs=1,  # Number of train epochs to be run by draft trainer.\n",
+    "    monitor=False,  # Set to True to wait for completion.\n",
+    "    base_model_mlflow_path=\"azureml://registries/azureml-meta/models/Meta-Llama-3-8B-Instruct/versions/9\",\n",
+    "    draft_train_data_path=\"./data_for_draft_model/train/sharegpt_train_small.jsonl\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🔄Section 6. Prepare Combined Model for Deployment</h3>\n",
+    "</div>\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "html"
+    }
+   },
+   "source": [
+    "<p>For creation of a <strong>speculative decoding endpoint</strong>, we need <strong>two models</strong> working in tandem:</p>\n",
+    "\n",
+    "<ul>\n",
+    "    <li><strong>Base Model:</strong> The primary model (e.g., Llama-3.1-8B-Instruct-FP8) that generates high-quality outputs</li>\n",
+    "    <li><strong>Draft Model:</strong> The EAGLE3 model that quickly generates candidate tokens for speculation</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p><strong>Why Combine Into Single AML Model?</strong></p>\n",
+    "\n",
+    "<p>We'll package both models into a <strong>single Azure ML model</strong> to:</p>\n",
+    "<ul>\n",
+    "    <li>Simplify deployment to Azure ML online endpoints</li>\n",
+    "    <li>Ensure both models are versioned and managed together</li>\n",
+    "    <li>Streamline the endpoint creation process</li>\n",
+    "    <li>Enable seamless speculative decoding inference</li>\n",
+    "</ul>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download draft model, download base model, combine and register for deployment\n",
+    "combined_model = prepare_combined_model_for_deployment(\n",
+    "    ml_client=ml_client,\n",
+    "    registry_ml_client=registry_ml_client,\n",
+    "    draft_job_name=draft_job.name,  # Previous Draft Trainer job name for downloading draft model.\n",
+    "    base_model_hf_id=\"nvidia/Llama-3.1-8B-Instruct-FP8\",  # Huggingface ID of the base model paired along with draft model.\n",
+    "    model_name=\"speculative-decode-model\",  # User provided model name for combined model.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🚀Section 7. Deploy Speculative Decoding Endpoint</h3>\n",
+    "</div>\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>This section creates and deploys a <strong>managed online endpoint</strong> that leverages the combined model for speculative decoding inference.</p>\n",
+    "<strong>What happens during deployment:</strong>\n",
+    "<ul>\n",
+    "    <li><strong>Endpoint Creation:</strong> Sets up a managed online endpoint in Azure ML.</li>\n",
+    "    <li><strong>Model Loading:</strong> Loads both the base model and EAGLE3 draft model onto GPU instances, setting it up for inference.</li>\n",
+    "</ul>\n",
+    "<p>The deployment process typically takes 15-20 minutes depending on instance availability.</p>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Deploy managed online endpoint with speculative decoding\n",
+    "endpoint_name = deploy_speculative_decoding_endpoint(\n",
+    "    ml_client=ml_client,  # ML Client which specifies the workspace where endpoint gets deployed.\n",
+    "    combined_model=combined_model,  # Reference from previous steps where combined model is created.\n",
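+    "    instance_type=\"octagepu\",  # Instance type on the Kubernetes cluster. NOTE(review): the prerequisites cell spells this 'octagpu' - confirm which name the cluster defines.\n",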
+    "    compute_name=\"k8s-a100-compute\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🚀Section 8. Deploy Base Model Endpoint for Comparison</h3>\n",
+    "</div>\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>This section creates and deploys a <strong>managed online endpoint</strong> with just the base model for performance comparison against the speculative decoding endpoint.</p>\n",
+    "\n",
+    "<strong>What happens during deployment:</strong>\n",
+    "<ul>\n",
+    "    <li><strong>Endpoint Creation:</strong> Sets up a standard managed online endpoint in Azure ML.</li>\n",
+    "    <li><strong>Base Model Loading:</strong> Loads only the base model onto GPU instances for standard inference.</li>\n",
+    "    <li><strong>Performance Baseline:</strong> Provides a baseline to measure the speedup achieved by speculative decoding.</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p>This baseline endpoint allows you to compare inference speed between standard generation and speculative decoding approaches.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Deploy managed online endpoint with base model\n",
+    "base_endpoint_name = create_managed_deployment(  # Function to create endpoint for base model.\n",
+    "    ml_client=ml_client,  # ML Client which specifies the workspace where endpoint gets deployed.\n",
+    "    model_asset_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",  # Huggingface ID of the base model.\n",
+    "    instance_type=\"Standard_ND96amsr_A100_v4\",  # Compute SKU on which base model will be deployed.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">🧪Section 9. Test Deployment</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>This section tests both the speculative decoding endpoint and base model endpoint.</p>\n",
+    "\n",
+    "<strong>What happens during testing:</strong>\n",
+    "<ul>\n",
+    "    <li><strong>Endpoint Validation:</strong> Confirms both endpoints are responding correctly to inference requests.</li>\n",
+    "</ul>\n",
+    "\n",
+    "<p>The testing process validates that the deployed models can handle requests and respond successfully.</p>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "speculative_result = test_deployment(\n",
+    "    ml_client, endpoint_name\n",
+    ")  # Test the deployed endpoint with a financial reasoning question\n",
+    "base_result = test_deployment(\n",
+    "    ml_client, base_endpoint_name\n",
+    ")  # Test the deployed endpoint with a financial reasoning question"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## <span style=\"font-size:0.8em;\"> </span>\n",
+    "\n",
+    "<div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 20px 0;\">\n",
+    "    <h3 style=\"margin: 0; text-align: center;\">📊Section 10. Performance Evaluation Pipeline</h3>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<p>This section launches a comprehensive evaluation pipeline to compare performance metrics between the base model endpoint and speculative decoding endpoint.</p>\n",
+    "\n",
+    "\n",
+    "<p><strong>What happens during evaluation:</strong></p>\n",
+    "<ul>\n",
+    "    <li><strong>Performance Comparison:</strong> Analyzes speed improvements achieved by speculative decoding</li>\n",
+    "    <li><strong>Statistical Analysis:</strong> Provides detailed metrics and visualizations of performance gains</li>\n",
+    "</ul>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
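+    "# Compare the performance of the base model and speculative decoding endpoints. NOTE(review): run_evaluation_speculative_decoding is not imported in the imports cell above - import it from its defining module before running this cell.\n",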
+    "evaluation_job = run_evaluation_speculative_decoding(\n",
+    "    ml_client=ml_client,\n",
+    "    base_endpoint_name=base_endpoint_name,  # Base model endpoint from previous step.\n",
+    "    speculative_endpoint_name=endpoint_name,  # Speculative endpoint from previous step.\n",
+    "    base_model=\"meta-llama/Meta-Llama-3-8B-Instruct\",  # HuggingFace repo ID of the model used in base endpoint, used for tokenization.\n",
+    "    speculative_model=\"meta-llama/Meta-Llama-3-8B-Instruct\",  # HuggingFace repo ID of the model used in speculative decoding endpoint, used for tokenization.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following metrics are used to evaluate the performance of the endpoints:\n",
+    " \n",
+    "- **Input Throughput (Tokens/sec)**: Measures how many input tokens per second the model/server can process.\n",
+    "- **Output Throughput (Tokens/sec)**: Measures how many output tokens per second the model/server can generate.\n",
+    "- **Request Throughput (Requests/sec)**: Measures how many complete requests the model/server can handle per second.\n",
+    " \n",
+    "It is expected that the **speculative decoding endpoint will outperform the base model endpoint** across all these metrics, demonstrating the efficiency gains achieved through speculative decoding."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"metrics-base-target-spec-dec.png\" alt=\"Performance Metrics: Base Model vs Speculative Decoding\" style=\"max-width: 100%; height: auto;\">"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/requirements.txt b/sdk/python/foundation-models/system/reinforcement-learning/requirements.txt
new file mode 100644
index 000000000..0002d5983
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/requirements.txt
@@ -0,0 +1,8 @@
+# use python3.12 or above
+azure-ai-ml==1.30.0
+azure-identity==1.25.1
+azureml-mlflow==1.60.0.post1
+huggingface-hub==1.1.5
+matplotlib==3.10.7
+mlflow==2.22.2
+ipykernel
\ No newline at end of file
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/dataset.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/dataset.py
new file mode 100644
index 000000000..13cd01942
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/dataset.py
@@ -0,0 +1,166 @@
+import os
+import json
+import subprocess
+import pandas as pd
+from tempfile import TemporaryDirectory
+from azure.ai.ml import MLClient
+from azure.ai.ml.entities import Data
+from azure.ai.ml.constants import AssetTypes
+
+
+def register_dataset(ml_client: MLClient, dataset_name: str, file_path: str):
+    """Register a dataset in Azure ML."""
+    data_asset = Data(
+        name=dataset_name,
+        path=file_path,
+        type=AssetTypes.URI_FILE,
+        description="FinQA dataset",
+        tags={"source": "https://github.com/czyssrs/FinQA"},
+        version="1",
+    )
+
+    registered_data = ml_client.data.create_or_update(data_asset)
+    print(f"Registered dataset {registered_data.name}.")
+    return registered_data
+
+
+def download_finqa_dataset(src: str, target_dir: str = "data/raw"):
+    """Prepare the FinQA dataset for training and evaluation."""
+    with TemporaryDirectory() as tmpdir:
+        print(f"Cloning raw FinQA dataset to {tmpdir} ...")
+        subprocess.run(["git", "clone", src, tmpdir], check=True)
+        os.makedirs(target_dir, exist_ok=True)
+        print("Converting FinQA dataset to jsonl format ...")
+        dataset_dir = os.path.join(tmpdir, "dataset")
+        filenames = ["train.json", "dev.json", "test.json"]
+        for filename in filenames:
+            target_file_name = filename.split(".")[0] + ".jsonl"
+            convert_to_jsonl(
+                current_path=os.path.join(dataset_dir, filename),
+                target_path=os.path.join(target_dir, target_file_name),
+            )
+
+
+def convert_to_jsonl(current_path: str, target_path: str):
+    """Convert FinQA dataset file to jsonl format."""
+    with open(current_path, "r") as rf, open(target_path, "w") as wf:
+        lines = json.loads(rf.read())
+        for item in lines:
+            wf.write(json.dumps(item) + "\n")
+    print(f"Converted {current_path} to {target_path}.")
+
+
+def prepare_finqa_dataset(
+    ml_client: MLClient, data_dir: str = "data", register_datasets: bool = False
+) -> tuple[str, str, str]:
+    """Download FinQA, reshape every example into a prompt record, and save the splits.
+
+    Steps: clone and convert the raw dataset into ``<data_dir>/raw``, map each
+    example to a record with ``data_source``, ``prompt``, ``ability``,
+    ``reward_model`` and ``extra_info`` fields, write the three splits as JSON
+    Lines into ``data_dir``, and optionally register them as data assets.
+
+    Args:
+        ml_client: Client used for registration; only used when
+            ``register_datasets`` is true.
+        data_dir: Directory receiving the raw and the processed files.
+        register_datasets: When true, register the three processed files.
+
+    Returns:
+        ``(train, test, valid)``: the registered asset IDs when registration was
+        requested and all three assets report an id, otherwise the local paths
+        of the processed ``.jsonl`` files.
+    """
+    # VERL finetuning relies on acceptable data sources for reward modeling and evaluation
+    # Every record is therefore tagged with this value instead of a FinQA-specific one.
+    data_source = "openai/gsm8k"
+
+    # download and convert dataset
+    raw_data_dir = os.path.join(data_dir, "raw")
+    FINQA_GIT_REPO = "https://github.com/czyssrs/FinQA"
+    download_finqa_dataset(src=FINQA_GIT_REPO, target_dir=raw_data_dir)
+    train_dataset_path = os.path.join(raw_data_dir, "train.jsonl")
+    test_dataset_path = os.path.join(raw_data_dir, "test.jsonl")
+    valid_dataset_path = os.path.join(raw_data_dir, "dev.jsonl")
+
+    def format_list_to_string(data_list: list):
+        """Join list items with newlines; falsy input gives "", strings pass through."""
+        if not data_list:
+            return ""
+        if isinstance(data_list, str):
+            return data_list
+        return "\n".join(str(item) for item in data_list)
+
+    def format_table(table_list: list):
+        """Render rows under a "Table:" header, joining list rows with " | "."""
+        if not table_list:
+            return ""
+        table_str = "\nTable:\n"
+        for row in table_list:
+            if isinstance(row, list):
+                table_str += " | ".join(str(cell) for cell in row) + "\n"
+            else:
+                table_str += str(row) + "\n"
+        return table_str
+
+    def map_fn(example: pd.Series, idx: int, split: str):
+        """Build the training record (prompt, ground truth, metadata) for one example."""
+        pre_instruction = "Please answer the following financial question based on the context provided."
+        post_instruction = (
+            'Let\'s think step by step and output the final answer after "####".'
+        )
+        qa = example.get("qa", {})
+        question = qa.get("question", "")
+        # NOTE(review): "exe_ans" is used only when the "answer" key is absent; an
+        # empty-string "answer" is kept as-is. Confirm that is intended.
+        answer = qa.get("answer", qa.get("exe_ans", ""))
+        # assumes the "gold_inds" values are strings (str.join rejects others) - TODO confirm
+        gold_evidence = "\n".join(qa.get("gold_inds", {}).values())
+        pre_text = format_list_to_string(example.get("pre_text", []))
+        post_text = format_list_to_string(example.get("post_text", []))
+        table = format_table(example.get("table", [])).strip()
+        # Build prompt content according to specified schema
+        # Sections are separated by blank lines; empty sections still contribute a separator.
+        prompt_content = "\n\n".join(
+            [
+                pre_instruction,
+                "Context: " + pre_text,
+                gold_evidence,
+                post_text,
+                table,
+                "Question: " + question,
+                post_instruction,
+            ]
+        )
+        data = {
+            "data_source": data_source,
+            "prompt": [
+                {
+                    "role": "user",
+                    "content": prompt_content,
+                }
+            ],
+            "ability": "financial_reasoning",
+            "reward_model": {"style": "rule", "ground_truth": answer},
+            "extra_info": {
+                "index": idx,
+                "answer": answer,
+                "question": question,
+                "split": split,
+            },
+        }
+        return data
+
+    # load datasets
+    train_dataset = pd.read_json(train_dataset_path, lines=True)
+    test_dataset = pd.read_json(test_dataset_path, lines=True)
+    valid_dataset = pd.read_json(valid_dataset_path, lines=True)
+
+    # map datasets
+    # With axis=1 each row arrives as a Series whose ``name`` is its index label.
+    train_dataset = train_dataset.apply(
+        lambda x: map_fn(x, x.name, split="train"), axis=1
+    )
+    test_dataset = test_dataset.apply(lambda x: map_fn(x, x.name, split="test"), axis=1)
+    valid_dataset = valid_dataset.apply(
+        lambda x: map_fn(x, x.name, split="valid"), axis=1
+    )
+
+    # save locally as jsonl
+    # The path variables are rebound from the raw files to the processed files here.
+    train_dataset_path = os.path.join(data_dir, "train.jsonl")
+    test_dataset_path = os.path.join(data_dir, "test.jsonl")
+    valid_dataset_path = os.path.join(data_dir, "valid.jsonl")
+    train_dataset.to_json(train_dataset_path, orient="records", lines=True)
+    test_dataset.to_json(test_dataset_path, orient="records", lines=True)
+    valid_dataset.to_json(valid_dataset_path, orient="records", lines=True)
+
+    # register datasets
+    if register_datasets:
+        train_data = register_dataset(ml_client, "finqa_train", train_dataset_path)
+        test_data = register_dataset(ml_client, "finqa_test", test_dataset_path)
+        valid_data = register_dataset(ml_client, "finqa_valid", valid_dataset_path)
+        if (
+            (train_data and train_data.id)
+            and (test_data and test_data.id)
+            and (valid_data and valid_data.id)
+        ):
+            return train_data.id, test_data.id, valid_data.id
+
+    # Registration not requested, or an asset came back without an id: use local paths.
+    return train_dataset_path, test_dataset_path, valid_dataset_path
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/deployment.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/deployment.py
new file mode 100644
index 000000000..944d81a09
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/deployment.py
@@ -0,0 +1,200 @@
+import uuid
+import requests
+from typing import Optional
+from azure.ai.ml import MLClient
+from azure.ai.ml.entities import (
+    EndpointAuthKeys,
+    ManagedOnlineEndpoint,
+    ManagedOnlineDeployment,
+    KubernetesOnlineEndpoint,
+    KubernetesOnlineDeployment,
+    ProbeSettings,
+    OnlineRequestSettings,
+)
+
+
+def get_default_probe_settings() -> ProbeSettings:
+    """Get default probe settings for deployments."""
+    return ProbeSettings(  # Probes are APIs exposed by the deployment which informs the frameworktraffic
+        initial_delay=1400,  # if the deployment is healthy and ready to receive
+        period=30,
+        timeout=2,
+        success_threshold=1,
+        failure_threshold=30,
+    )
+
+
+def get_default_request_settings() -> OnlineRequestSettings:
+    """Get default request settings for deployments."""
+    return OnlineRequestSettings(  # Online request setting which controls timeout and concurrent request per instance
+        request_timeout_ms=90000,
+        max_concurrent_requests_per_instance=4,
+    )
+
+
+def create_managed_deployment(
+    ml_client: MLClient,
+    model_asset_id: str,  # Asset ID of the model to deploy
+    instance_type: str,  # Supported instance type for managed deployment
+    environment_asset_id: Optional[str] = None,  # Asset ID of the serving engine to use
+    endpoint_name: Optional[str] = None,
+    endpoint_description: str = "Sample endpoint",
+    endpoint_tags: dict = {},
+    deployment_name: Optional[str] = None,
+    deployment_env_vars: dict = {},
+) -> str:
+    """Create a managed deployment."""
+    guid = str(uuid.uuid4())[:8]  # Unique suffix to avoid name collisions
+    endpoint_name = endpoint_name or f"rl-endpoint"
+    endpoint_name = f"{endpoint_name}-{guid}"  # Unique names prevent collisions and allow parallel experiments
+    deployment_name = deployment_name or "default"
+
+    endpoint = ManagedOnlineEndpoint(  # Use AzureML endpoint abstraction for traffic management and auth
+        name=endpoint_name,
+        auth_mode="key",
+        description=endpoint_description,
+        tags=endpoint_tags,
+    )
+
+    print(f"Creating endpoint: {endpoint_name}")
+    ml_client.online_endpoints.begin_create_or_update(
+        endpoint
+    ).wait()  # Using there the endpoint object to trigger actual endpoint in AML workspace.
+
+    deployment = ManagedOnlineDeployment(  # Use deployment abstraction for scaling, versioning, and isolation
+        name=deployment_name,
+        endpoint_name=endpoint_name,
+        model=model_asset_id,
+        instance_type=instance_type,
+        instance_count=1,
+        environment=environment_asset_id,
+        environment_variables=deployment_env_vars,
+        liveness_probe=get_default_probe_settings(),
+        readiness_probe=get_default_probe_settings(),
+        request_settings=get_default_request_settings(),
+    )
+
+    print(f"Creating deployment (15-20 min)...")  #
+    ml_client.online_deployments.begin_create_or_update(deployment).wait()
+
+    # Route all traffic to new deployment for immediate use
+    endpoint.traffic = {deployment_name: 100}
+    ml_client.online_endpoints.begin_create_or_update(endpoint).result()
+
+    print(f"Endpoint ready: {endpoint_name}")
+
+    return endpoint_name
+
+
+def create_kubernetes_deployment(
+    ml_client: MLClient,
+    model_asset_id: str,  # Asset ID of the model to deploy
+    environment_asset_id: str,  # Asset ID of the serving engine to use
+    instance_type: str,  # Kubernetes supports partial node usage granular upto the GPU level
+    compute_name: str,  # Name of the compute which will be use for endpoint creation
+    endpoint_name: Optional[str] = None,
+    endpoint_description: str = "Sample endpoint",
+    endpoint_tags: dict = {},
+    deployment_name: Optional[str] = None,
+    deployment_env_vars: dict = {},
+    model_mount_path: str = "/var/model-mount",
+) -> str:
+    """Create endpoint using Kubernetes."""
+
+    print("🌐 Creating endpoint...")
+
+    guid = str(uuid.uuid4())[:8]  # Unique suffix to avoid name collisions
+    endpoint_name = endpoint_name or f"rl-endpoint"
+    endpoint_name = f"{endpoint_name}-{guid}"  # Unique names prevent collisions and allow parallel experiments
+    deployment_name = deployment_name or "default"
+
+    endpoint = KubernetesOnlineEndpoint(  # Use AzureML endpoint abstraction for traffic management and auth
+        name=endpoint_name,
+        auth_mode="key",
+        compute=compute_name,
+        description=endpoint_description,
+        tags=endpoint_tags,
+    )
+
+    print(f"Creating endpoint: {endpoint_name}")
+    ml_client.online_endpoints.begin_create_or_update(
+        endpoint
+    ).wait()  # Using there the endpoint object to trigger actual endpoint in AML workspace.
+
+    deployment = KubernetesOnlineDeployment(  # Use deployment abstraction for scaling, versioning, and isolation
+        name=deployment_name,
+        endpoint_name=endpoint_name,
+        model=model_asset_id,
+        model_mount_path=model_mount_path,
+        instance_type=instance_type,
+        instance_count=1,
+        environment=environment_asset_id,
+        environment_variables=deployment_env_vars,
+        liveness_probe=get_default_probe_settings(),
+        readiness_probe=get_default_probe_settings(),
+        request_settings=get_default_request_settings(),
+    )
+
+    print(f"Creating deployment (15-20 min)...")  #
+    ml_client.online_deployments.begin_create_or_update(deployment).wait()
+
+    # Route all traffic to new deployment for immediate use
+    endpoint.traffic = {deployment_name: 100}
+    ml_client.online_endpoints.begin_create_or_update(endpoint).result()
+
+    print(f"Endpoint ready: {endpoint_name}")
+
+    return endpoint_name
+
+
+def test_deployment(ml_client, endpoint_name):
+    """Run a test request against a deployed endpoint and print the result."""
+    print("Testing endpoint...")
+    # Retrieve endpoint URI and API key to authenticate test request
+    scoring_uri = ml_client.online_endpoints.get(endpoint_name).scoring_uri
+    if not scoring_uri:
+        raise ValueError("Scoring URI not found for endpoint.")
+
+    api_keys = ml_client.online_endpoints.get_keys(endpoint_name)
+    if not isinstance(api_keys, EndpointAuthKeys) or not api_keys.primary_key:
+        raise ValueError("API key not found for endpoint.")
+
+    # Use a realistic financial question to verify model reasoning and output format
+    payload = {
+        "messages": [
+            {
+                "role": "user",
+                "content": """Please answer the following financial question:
+
+Context: A company has revenue of $1,000,000 and expenses of $750,000.
+
+Question: What is the profit margin as a percentage?
+Let's think step by step and put final answer after ####.""",
+            }
+        ],
+        "max_tokens": 512,
+        "temperature": 0.7,
+    }
+
+    # Set headers for JSON content and bearer authentication
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_keys.primary_key}",
+    }
+
+    response = requests.post(scoring_uri, json=payload, headers=headers)
+
+    if response.status_code == 200:
+        result = response.json()
+        # Extract the model response
+        if "choices" in result and len(result["choices"]) > 0:
+            answer = result["choices"][0]["message"]["content"]
+            print(f"Response received")
+            print(f"\n{'='*60}")
+            print(answer)
+            print(f"{'='*60}\n")
+            return result
+    else:
+        print(f"  ✗ Error: {response.status_code}")
+        print(f"  {response.text}")
+        return None
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/evaluation.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/evaluation.py
new file mode 100644
index 000000000..14256d600
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/evaluation.py
@@ -0,0 +1,123 @@
+import uuid
+from typing import Optional
+from azure.ai.ml import dsl, Input, MLClient
+from azure.ai.ml.constants import AssetTypes
+from azure.ai.ml.entities import Job
+from scripts.run import monitor_run
+
+
+class EvaluationPipeline:
+    """Run Evaluation"""
+
+    DEFAULT_CONFIGS = {
+        "evaluate_base_model": False,
+        "batch_size": 16,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "tensor_parallel_size": 1,
+        "gpu_memory_utilization": 0.8,
+        "extraction_method": "flexible",
+        "number_of_trials": 3,
+    }
+
+    def __init__(self, ml_client: MLClient, registry_ml_client: MLClient):
+        self.guid = str(uuid.uuid4())[:8]
+        self._ml_client = ml_client
+        self._eval_pipeline_component = registry_ml_client.components.get(
+            name="pipeline_model_evaluation", label="latest"
+        )
+
+    def create_evaluate_pipeline(
+        self,
+        compute: str,
+        model_dir_1: Input,
+        model_dir_2: Input,
+        validation_dataset_path: Input,
+        base_model_path: Optional[Input] = None,
+        instance_type: Optional[str] = None,
+        config={},
+    ) -> Job:
+        """Create and submit evaluation pipeline job using registry component."""
+
+        # Update default configs with any provided config
+        self.DEFAULT_CONFIGS.update(config)
+        print(f"Running with config {self.DEFAULT_CONFIGS}")
+
+        @dsl.pipeline
+        def create_pipeline():
+            eval_pipeline = self._eval_pipeline_component(
+                compute=compute,
+                instance_type=instance_type,
+                base_model_path=base_model_path,
+                checkpoint_base_path_1=model_dir_1,
+                checkpoint_base_path_2=model_dir_2,
+                validation_file=validation_dataset_path,
+                **self.DEFAULT_CONFIGS,
+            )
+            return {"evaluation_results": eval_pipeline.outputs.evaluation_results}
+
+        # Create pipeline object
+        print("Creating evaluation pipeline...")
+        pipeline_object = create_pipeline()
+
+        # Don't use cached results
+        if pipeline_object.settings is not None:
+            pipeline_object.settings.force_rerun = True
+            pipeline_object.settings.continue_on_step_failure = False
+
+        # Submit job
+        print("✓ Submitting Model Evaluation Pipeline ...")
+        pipeline_object.display_name = f"evaluate-model-{self.guid}"
+        eval_run = self._ml_client.jobs.create_or_update(
+            pipeline_object, experiment_name="evaluate-model"
+        )
+
+        print(f"✓ Job submitted: {eval_run.name}")
+        print(f"📊 Studio URL: {eval_run.studio_url}")
+
+        return eval_run
+
+
+def run_evaluation_pipeline(
+    ml_client: MLClient,
+    registry_ml_client: MLClient,
+    compute_cluster: str,
+    grpo_model_dir: str,
+    rlpp_model_dir: str,
+    validation_dataset_path: str,
+    base_model_path: Optional[str] = None,
+    instance_type: Optional[str] = None,
+    run_config: dict = {},
+):
+    """Run evaluation pipeline to compare finetuned models with baseline."""
+    print(" Starting Evaluation Pipeline")
+    pipeline = EvaluationPipeline(ml_client, registry_ml_client)
+
+    grpo_model_input = Input(type=AssetTypes.URI_FOLDER, path=grpo_model_dir)
+    rlpp_model_input = Input(type=AssetTypes.URI_FOLDER, path=rlpp_model_dir)
+    base_model_input = (
+        Input(type=AssetTypes.URI_FOLDER, path=base_model_path)
+        if isinstance(base_model_path, str)
+        else base_model_path
+    )
+    validation_dataset_input = Input(
+        type=AssetTypes.URI_FILE, path=validation_dataset_path
+    )
+
+    eval_job = pipeline.create_evaluate_pipeline(
+        compute=compute_cluster,
+        instance_type=instance_type,
+        model_dir_1=grpo_model_input,
+        model_dir_2=rlpp_model_input,
+        validation_dataset_path=validation_dataset_input,
+        base_model_path=base_model_input,
+        config=run_config,
+    )
+
+    eval_job, status = monitor_run(ml_client, eval_job)
+    if status == "Completed":
+        print("\n Evaluation completed successfully")
+        return eval_job, status
+    else:
+        print(f"\n Job did not complete successfully: {status}")
+        return eval_job, status
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/reinforcement_learning.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/reinforcement_learning.py
new file mode 100644
index 000000000..084c1f5f7
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/reinforcement_learning.py
@@ -0,0 +1,168 @@
+import uuid
+from azure.ai.ml import Input, MLClient, dsl
+from azure.ai.ml.constants import AssetTypes
+from azure.ai.ml.entities import Model
+from scripts.run import get_run_output_path, monitor_run
+
+
+class RLSpecDecPipeline:
+    """Main class for managing RL training and Speculative Decoding workflow."""
+
+    def __init__(self, ml_client: MLClient, registry_ml_client: MLClient):
+        # We use an unique identifier for naming resources, this prevents name collisions for resources created in this lab
+        self.guid = str(uuid.uuid4())[:8]
+        self._ml_client = ml_client
+        self._registry_ml_client = registry_ml_client
+
+    def create_rl_pipeline(
+        self,
+        huggingface_id,
+        train_data_path,
+        valid_data_path,
+        compute_cluster,
+        config={},
+    ):
+        """Create and submit RL pipeline job using registry component."""
+
+        # Group Relative Position Optimization (GRPO) and Reinforce Plus Plus (RLPP) are novel Reinforcement techniques
+        # designed to finetune a model to comply to a given reward function.  The RL pipeline is an AzureML pipeline which
+        # provides all the steps to finetune a base model using GRPO or RLPP on a given dataset.
+        print("Creating RL pipeline...")
+
+        # Use defaults to ensure reproducibility and avoid missing params
+        default_config = {
+            "instance_type_finetune": "octagpu",
+            "instance_type_model_import": "octacpu",
+            "num_nodes_finetune": 2,
+            "number_of_gpu_to_use_finetuning": 8,
+            "algorithm_adv_estimator": "grpo",
+            "data_max_prompt_length": 8192,
+            "actor_strategy": "fsdp",
+            "trainer_total_epochs": 1,
+            "actor_fsdp_config_mixed_precision_reduce_dtype": "bf16",
+            "actor_fsdp_config_mixed_precision_buffer_dtype": "bf16",
+        }
+        default_config.update(config)  # Allow user override for flexibility
+        algorithm = default_config.get("algorithm_adv_estimator", "grpo").lower()
+        algorithm = algorithm.replace("_", "-")
+
+        # Extract experiment_name from config as that is passed separately
+        if "experiment_name" in default_config:
+            experiment_name = default_config["experiment_name"]
+            del default_config["experiment_name"]
+        else:
+            experiment_name = f"reinforcement-learning-{algorithm}"
+
+        # Use registry component for versioning and reuse
+        pipeline_component_func = self._registry_ml_client.components.get(
+            name="pipeline_rl_finetune", label="latest"
+        )
+
+        # Define pipeline to encapsulate all steps for traceability and reuse
+        @dsl.pipeline
+        def create_pipeline():
+            rl_pipeline = pipeline_component_func(
+                huggingface_id=huggingface_id,
+                compute_model_import=compute_cluster,
+                compute_finetune=compute_cluster,
+                data_train_files=Input(type=AssetTypes.URI_FILE, path=train_data_path),
+                data_val_files=Input(type=AssetTypes.URI_FILE, path=valid_data_path),
+                **default_config,  # Pass all config as kwargs for maintainability and future-proofing
+            )
+            return {"model_output": rl_pipeline.outputs.model_output}
+
+        pipeline_object = create_pipeline()
+
+        # Force rerun to ensure new job, avoid stale results
+        if pipeline_object.settings is not None:
+            pipeline_object.settings.force_rerun = True
+            pipeline_object.settings.continue_on_step_failure = (
+                False  # Fail fast for debugging
+            )
+
+        # Submit job
+        print("Submitting pipeline...")
+        pipeline_object.display_name = f"{algorithm}-{self.guid}"
+        rl_run = self._ml_client.jobs.create_or_update(
+            pipeline_object, experiment_name=experiment_name
+        )
+        print(f"Studio URL: {rl_run.studio_url}")  # Clickable link for monitoring
+
+        return rl_run
+
+    def register_model(self, job, model_name_prefix, description="", tags={}):
+        """Assets must be registered as models before use in endpoints."""
+        print("Registering model from job output...")
+
+        # Use GUID to ensure model name uniqueness across runs
+        model_name = f"{model_name_prefix}-{self.guid}"
+        model_output_path = get_run_output_path(
+            self._ml_client, job.name, "model_output"
+        )
+
+        model = Model(
+            name=model_name,
+            path=model_output_path,
+            type=AssetTypes.CUSTOM_MODEL,
+            description=description,
+            job_name=job.name,
+            tags=tags,
+        )
+
+        registered_model = self._ml_client.models.create_or_update(
+            model
+        )  # Register the model
+        print(f"Model: {registered_model.name} v{registered_model.version}")
+        print(f"ID: {registered_model.id}")
+
+        return registered_model
+
+
+def run_rl_training_pipeline(
+    ml_client: MLClient,
+    registry_ml_client: MLClient,
+    base_model_id: str,
+    train_data_path: str,
+    valid_data_path: str,
+    compute_cluster: str,
+    rl_method="grpo",
+    config={},
+):
+    # Group Relative Position Optimization (GRPO) and Reinforce Plus Plus (RLPP) are novel Reinforcement techniques
+    # designed to finetune a model to comply to a given reward function.  The RL pipeline is an AzureML pipeline which
+    # provides all the steps to finetune a base model using GRPO or RLPP on a given dataset.
+    print("Starting RL Training Pipeline")
+    pipeline = RLSpecDecPipeline(ml_client, registry_ml_client)
+
+    # # We have uploaded the data assets to our registry in advance for this tutorial
+    # train_asset = ml_client.data.get(name="dataset_training_finqa", label="latest")
+    # val_asset = ml_client.data.get(name="dataset_validation_finqa", label="latest")
+
+    # Submit RL pipeline job with all required config and assets
+    config["algorithm_adv_estimator"] = rl_method
+    rl_job = pipeline.create_rl_pipeline(
+        huggingface_id=base_model_id,
+        train_data_path=train_data_path,
+        valid_data_path=valid_data_path,
+        compute_cluster=compute_cluster,
+        config=config,
+    )
+
+    completed_job, status = monitor_run(ml_client, rl_job)
+    if status == "Completed":
+        # Register the trained model for downstream deployment and tracking
+        description = f"{rl_method} fine-tuned model on FinQA"
+        tags = {
+            "base_model_id": base_model_id,
+            "rl_method": rl_method,
+        }
+        registered_model = pipeline.register_model(
+            job=completed_job,
+            model_name_prefix=f"{rl_method}-finqa-model",
+            description=description,
+            tags=tags,
+        )
+        return rl_job, status, registered_model
+    else:
+        print(f"\n Job did not complete successfully: {status}")
+        return rl_job, status, None
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/run.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/run.py
new file mode 100644
index 000000000..50f951141
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/run.py
@@ -0,0 +1,138 @@
+import time
+import mlflow
+import requests
+from typing import Optional
+from azure.ai.ml import MLClient
+from azure.ai.ml.entities import Job, Workspace
+from azure.ai.ml.entities._assets._artifacts.artifact import Artifact
+
+
+def get_run_details(ml_client: MLClient, job_name: str) -> dict:
+    """Get run details."""
+    # API endpoint template
+    run_details_template = "https://ml.azure.com/api/{location}/history/v1.0/subscriptions/{subscription}/resourceGroups/{resource_group_name}/providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}/experimentids/00000000-0000-0000-0000-000000000000/runs/{job_name}/details"
+
+    # Get workspace details
+    workspace_details: Optional[Workspace] = ml_client.workspaces.get(
+        ml_client.workspace_name
+    )
+    if not workspace_details:
+        raise ValueError("Workspace not found.")
+
+    workspace_id: Optional[str] = workspace_details.id
+    location: Optional[str] = workspace_details.location
+    if not workspace_id or not location:
+        raise ValueError("Workspace ID or location is missing.")
+
+    # Extract subscription ID, resource group name, and workspace name from workspace ID
+    parts = workspace_id.split("/")
+    subscription_id: str = parts[2]
+    resource_group_name: str = parts[4]
+    workspace_name: str = parts[8]
+
+    # Construct run details URI
+    run_details_uri = run_details_template.format(
+        location=location,
+        subscription=subscription_id,
+        resource_group_name=resource_group_name,
+        workspace_name=workspace_name,
+        job_name=job_name,
+    )
+
+    print(f"requesting run details from: {run_details_uri}")
+
+    token = ml_client._credential.get_token(
+        "https://management.azure.com/.default"
+    ).token
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Content-Type": "application/json",
+    }
+
+    # Make GET request to retrieve run details
+    response = requests.get(run_details_uri, headers=headers)
+    response.raise_for_status()
+
+    # Return run details as JSON
+    return response.json()
+
+
+def get_run_output_assetid(ml_client: MLClient, job_name: str, output_name: str) -> str:
+    """Get the assetId of a specific job output."""
+    run_details = get_run_details(ml_client, job_name)
+    if run_details is None:
+        raise ValueError(f"Run details for job '{job_name}' not found.")
+    print(f"Run details retrieved for job: {job_name}")
+    if "outputs" in run_details and output_name in run_details["outputs"]:
+        return run_details["outputs"][output_name]["assetId"]
+    else:
+        raise ValueError(f"Output '{output_name}' not found in job '{job_name}'")
+
+
+def get_run_output_path(ml_client: MLClient, job_name: str, output_name: str) -> str:
+    """Get the path of a specific job output."""
+    run_details = get_run_details(ml_client, job_name)
+    if run_details is None:
+        raise ValueError(f"Run details for job '{job_name}' not found.")
+    print(f"Run details retrieved for job: {job_name}")
+    if "outputs" in run_details and output_name in run_details["outputs"]:
+        assetId = run_details["outputs"][output_name]["assetId"]
+        assetType = run_details["outputs"][output_name]["type"]
+        parts = assetId.split("/")
+        assetName = parts[-3]
+        assetVersion = parts[-1]
+        if assetType == "UriFile" or assetType == "UriFolder":
+            asset = ml_client.data.get(assetName, assetVersion)
+            return str(asset.path)
+        elif assetType == "CustomModel" or assetType == "MLFlowModel":
+            asset = ml_client.models.get(assetName, assetVersion)
+            return str(asset.path)
+        else:
+            raise ValueError(
+                f"Unsupported asset type '{assetType}' for output '{output_name}'"
+            )
+    else:
+        raise ValueError(f"Output '{output_name}' not found in job '{job_name}'")
+
+
+def get_run_metrics(job: Job) -> dict:
+    """Extract metrics from completed job."""
+    if job is None or job.name is None:
+        raise ValueError("Job or job.name is None.")
+
+    print(f"Fetching metrics for job {job.name} ...")
+    evaluation_run = mlflow.get_run(job.name)
+    search_result = mlflow.search_runs(
+        experiment_ids=[evaluation_run.info.experiment_id],
+        filter_string="tags.mlflow.rootRunId = '{}' AND tags.mlflow.runName = '{}'".format(
+            job.name, "component_model_evaluation"
+        ),
+        output_format="list",
+    )
+
+    if len(search_result) == 0:
+        print("No metrics found.")
+        return {}
+
+    eval_run = search_result[0]
+    metrics = eval_run.data.metrics
+    print(f"✓ Metrics extracted: {metrics}")
+    return metrics
+
+
+def monitor_run(
+    ml_client: MLClient, job: Job, poll_interval: int = 30
+) -> tuple[Job, str]:
+    if job is None or job.name is None:
+        raise ValueError("Job or job.name is None.")
+
+    job_name = job.name
+    print(f"Monitoring job: {job_name}")
+    print(f"Checking every {poll_interval} seconds...")
+    while True:
+        job = ml_client.jobs.get(job_name)
+        status = job.status
+        print(f"[{time.strftime('%H:%M:%S')}] Status: {status}")
+        if status in ["Completed", "Failed", "Canceled"]:
+            return job, status
+        time.sleep(poll_interval)
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/speculative_decoding.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/speculative_decoding.py
new file mode 100644
index 000000000..b22c420ac
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/speculative_decoding.py
@@ -0,0 +1,434 @@
+import os
+import json
+import uuid
+import shutil
+from pathlib import Path
+from huggingface_hub import snapshot_download
+from azure.ai.ml import dsl, Input, MLClient
+from azure.ai.ml.constants import AssetTypes
+from azure.ai.ml.entities import Model
+from scripts.run import monitor_run
+from scripts.deployment import create_kubernetes_deployment
+
+
+class DraftModelPipeline:
+    """Train, download, and register a draft model for speculative decoding.
+
+    Each instance carries a short random id (``guid``) that is appended to the
+    config file name, the job display name, and the registered model name.
+    """
+
+    def __init__(self, ml_client: MLClient, registry_ml_client: MLClient):
+        """Store the clients and generate the per-instance id.
+
+        Args:
+            ml_client: Client used to submit jobs, download job outputs, and
+                register models.
+            registry_ml_client: Client used to look up the pipeline component.
+        """
+        self.guid = str(uuid.uuid4())[:8]
+        self.ml_client = ml_client
+        self.registry_ml_client = registry_ml_client
+
+    def create_draft_model_pipeline(
+        self,
+        base_model_path,
+        training_data_path,
+        validation_data_path=None,
+        draft_model_config=None,
+        compute_name=None,
+        num_epochs=1,
+        component_name="speculative_decoding_draft_pipeline",
+    ):
+        """Build and submit the draft-model training pipeline.
+
+        Args:
+            base_model_path: Path passed to the component as the MLflow model input.
+            training_data_path: Path passed as the training split.
+            validation_data_path: Path passed as the validation split; the
+                training split is reused when this is ``None``.
+            draft_model_config: Dict written to a JSON file and passed to the
+                component; ``create_draft_model_config()`` is used when ``None``.
+            compute_name: Compute target used for both the model-import and
+                the training steps.
+            num_epochs: Number of training epochs forwarded to the component.
+            component_name: Name of the pipeline component to fetch (latest
+                label) through the registry client.
+
+        Returns:
+            The job object returned by ``jobs.create_or_update``.
+
+        Raises:
+            Exception: Whatever the component lookup raises is re-raised after
+                a hint is printed.
+        """
+        # Fine-tuning the draft model in speculative decoding makes its predictions closer to the target model, increasing token acceptance
+        # and reducing rollbacks. This alignment improves decoding speed and efficiency while maintaining output quality. It also enables
+        # better performance for domain-specific tasks by adapting the draft model to relevant data. AzureML provides a prebuilt pipeline for this fine-tuning process.
+
+        print("Creating draft model pipeline...")
+
+        # Use validation data same as training if not provided
+        if validation_data_path is None:
+            validation_data_path = training_data_path
+
+        # Get draft model configuration
+        if draft_model_config is None:
+            draft_model_config = create_draft_model_config()
+
+        # Save draft config locally; the guid keeps instances from overwriting each other
+        config_dir = "./draft_config"
+        os.makedirs(config_dir, exist_ok=True)
+        config_path = os.path.join(config_dir, f"draft_config_{self.guid}.json")
+
+        with open(config_path, "w") as f:
+            json.dump(draft_model_config, f, indent=4)
+
+        # Get the draft model pipeline
+        try:
+            pipeline_component_func = self.registry_ml_client.components.get(
+                name=component_name, label="latest"
+            )
+            print(
+                f"Component loaded: {pipeline_component_func.name} v{pipeline_component_func.version}"
+            )
+        except Exception as e:
+            print(f"Failed to load component: {e}")
+            print(f"Make sure component '{component_name}' exists in registry")
+            raise
+
+        # Define the pipeline job
+        @dsl.pipeline
+        def create_pipeline():
+            draft_pipeline = pipeline_component_func(
+                mlflow_model_path=Input(
+                    type=AssetTypes.MLFLOW_MODEL, path=base_model_path
+                ),
+                dataset_train_split=Input(
+                    type=AssetTypes.URI_FILE, path=training_data_path
+                ),
+                dataset_validation_split=Input(
+                    type=AssetTypes.URI_FILE, path=validation_data_path
+                ),
+                draft_model_config=Input(type=AssetTypes.URI_FILE, path=config_path),
+                compute_model_import=compute_name,
+                compute_eagle3_training=compute_name,
+                num_epochs=num_epochs,
+            )
+            return {"output_model": draft_pipeline.outputs.output_model_path}
+
+        # Create pipeline object
+        pipeline_object = create_pipeline()
+
+        # Don't use cached results
+        if pipeline_object.settings is not None:
+            pipeline_object.settings.force_rerun = True
+            pipeline_object.settings.continue_on_step_failure = False
+
+        # Submit job
+        print("Submitting draft model pipeline...")
+        pipeline_object.display_name = f"draft-model-{self.guid}"
+        draft_run = self.ml_client.jobs.create_or_update(
+            pipeline_object, experiment_name="speculative-decoding-draft-model"
+        )
+
+        print(f"Job submitted: {draft_run.name}")
+        print(f"Studio URL: {draft_run.studio_url}")
+
+        return draft_run
+
+    def download_draft_model(self, job_name, output_dir="./models/draft"):
+        """Download the ``output_model`` output of a job and prepare it for use.
+
+        After the download, every file is moved to the top level of
+        ``output_dir`` and ``config.json`` (if present) is rewritten with the
+        extended-context settings.
+
+        Args:
+            job_name: Name of the pipeline job to download from.
+            output_dir: Local directory that receives the files; created if missing.
+
+        Returns:
+            ``output_dir``.
+        """
+        print(f"Downloading draft model from job: {job_name}")
+
+        # Create output directory
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Download model output
+        self.ml_client.jobs.download(
+            name=job_name,
+            output_name="output_model",
+            download_path=output_dir,
+            all=True,
+        )
+
+        print(f"Draft model downloaded to: {output_dir}")
+
+        # Flatten directory structure
+        self._flatten_directory(output_dir)
+
+        # Update config with extended context
+        self._update_draft_config(output_dir)
+
+        return output_dir
+
+    def _flatten_directory(self, directory):
+        """Move every file found below ``directory`` up to ``directory`` itself.
+
+        A file whose name already exists at the top level is left where it is
+        and reported, so nothing is overwritten. Subdirectories that end up
+        empty are removed.
+
+        Args:
+            directory: Root directory to flatten in place.
+        """
+
+        print("Flattening directory structure...")
+
+        skipped = []
+        for root, _dirs, files in os.walk(directory):
+            if root == directory:
+                # Files already at the top level stay put.
+                continue
+            for file in files:
+                source = os.path.join(root, file)
+                destination = os.path.join(directory, file)
+                if os.path.exists(destination):
+                    # Never overwrite: keep the nested copy and report it below.
+                    skipped.append(source)
+                    continue
+                shutil.move(source, destination)
+
+        if skipped:
+            print(
+                f"Skipped {len(skipped)} file(s) whose name already exists in {directory}:"
+            )
+            for source in skipped:
+                print(f"  {source}")
+
+        # Remove empty subdirectories (bottom-up so parents empty out after children)
+        for root, dirs, _files in os.walk(directory, topdown=False):
+            for dir_name in dirs:
+                dir_path = os.path.join(root, dir_name)
+                if not os.listdir(dir_path):
+                    os.rmdir(dir_path)
+
+    def _update_draft_config(self, model_dir):
+        """Overwrite the context-length settings in ``model_dir/config.json``.
+
+        Sets ``max_position_embeddings`` and ``rope_scaling`` and leaves every
+        other key untouched. Does nothing (apart from a message) when the file
+        is missing.
+
+        Args:
+            model_dir: Directory expected to contain ``config.json``.
+        """
+        # The settings for running a model come both from the model files as well as tuning we apply on top.
+
+        config_path = os.path.join(model_dir, "config.json")
+
+        if not os.path.exists(config_path):
+            print("config.json not found, skipping update")
+            return
+
+        print("Updating draft model config...")
+
+        with open(config_path, "r") as f:
+            draft_config = json.load(f)
+
+        # Update with extended context settings
+        draft_config.update(
+            {
+                "max_position_embeddings": 131072,
+                "rope_scaling": {
+                    "factor": 8,
+                    "high_freq_factor": 4,
+                    "low_freq_factor": 1,
+                    "original_max_position_embeddings": 8192,
+                    "rope_type": "llama3",
+                },
+            }
+        )
+
+        with open(config_path, "w") as f:
+            json.dump(draft_config, f, indent=4)
+
+        print("Config updated with extended context settings")
+
+    def upload_combined_model(
+        self,
+        base_model_dir,
+        draft_model_dir,
+        model_name="speculative-decoding-combined",
+    ):
+        """Register the local ``./models/`` folder as one combined model.
+
+        Args:
+            base_model_dir: Local directory holding the base model; must exist.
+            draft_model_dir: Local directory holding the draft model; must exist.
+            model_name: Prefix of the registered model name; the instance
+                ``guid`` is appended.
+
+        Returns:
+            The model object returned by ``models.create_or_update``.
+
+        Raises:
+            FileNotFoundError: If either model directory does not exist, so an
+                incomplete package is never registered.
+        """
+        # A draft model deployment requires both the draft model and the base model.
+        # The sglang engine uses the draft model to generate speculative tokens, while the base model
+        # verifies and finalizes the output. This function prepares both models for deployment.
+
+        print("Creating combined model package...")
+        # NOTE(review): the upload root is fixed; base_model_dir and
+        # draft_model_dir are checked and logged but do not change it.
+        # Presumably both are expected to live under ./models/ - confirm.
+        combined_dir = "./models/"
+        print(f"Base model: {base_model_dir}")
+        print(f"Draft model: {draft_model_dir}")
+
+        # Fail before uploading when one half of the package is missing.
+        for label, model_dir in (
+            ("Base", base_model_dir),
+            ("Draft", draft_model_dir),
+        ):
+            if not os.path.isdir(model_dir):
+                raise FileNotFoundError(
+                    f"{label} model directory not found: {model_dir}"
+                )
+
+        # Register combined model
+        model_name_versioned = f"{model_name}-{self.guid}"
+
+        model = Model(
+            path=combined_dir,
+            name=model_name_versioned,
+            description="Combined base and draft model for speculative decoding",
+            tags={
+                "type": "speculative_decoding",
+                "architecture": "eagle3",
+            },
+        )
+
+        registered_model = self.ml_client.models.create_or_update(model)
+        print(f"Model registered: {registered_model.name} v{registered_model.version}")
+
+        return registered_model
+
+
+def create_draft_model_config(base_model_config=None):
+    """Build the EAGLE3 draft-model configuration.
+
+    Starts from the built-in defaults and, when ``base_model_config`` is
+    given and non-empty, overlays its entries on top (caller values win).
+    A new dict is built on every call.
+
+    Args:
+        base_model_config: Optional overrides / extra keys.
+
+    Returns:
+        The merged configuration dict.
+    """
+    config = dict(
+        architectures=["LlamaForCausalLMEagle3"],
+        bos_token_id=128000,
+        eos_token_id=128001,
+        hidden_act="silu",
+        hidden_size=4096,
+        initializer_range=0.02,
+        intermediate_size=14336,
+        max_position_embeddings=2048,
+        model_type="llama",
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        num_hidden_layers=1,
+        pad_token_id=0,
+        rms_norm_eps=1e-05,
+        tie_word_embeddings=False,
+        torch_dtype="float16",
+        transformers_version="4.28.1",
+        use_cache=True,
+        vocab_size=128256,
+        draft_vocab_size=32000,
+    )
+
+    if not base_model_config:
+        return config
+
+    config.update(base_model_config)
+    return config
+
+
+def run_draft_model_pipeline(
+    ml_client: MLClient,
+    registry_ml_client: MLClient,
+    compute_cluster: str,
+    base_model_mlflow_path: str,
+    draft_train_data_path="./data_for_draft_model/train/sharegpt_train_small.jsonl",
+    num_epochs=1,
+    monitor=False,
+):
+    """Submit the EAGLE3 draft-model training pipeline and optionally wait for it.
+
+    Args:
+        ml_client: Client used to submit (and, if requested, poll) the job.
+        registry_ml_client: Client used to fetch the
+            ``eagle3_chat_completion_pipeline`` component (latest label).
+        compute_cluster: Compute target used for both pipeline steps.
+        base_model_mlflow_path: Path passed to the component as the MLflow
+            model input.
+        draft_train_data_path: Local training file; it is also passed as the
+            validation split.
+        num_epochs: Number of training epochs forwarded to the component.
+        monitor: When true, block until the job reaches a terminal status.
+
+    Returns:
+        ``(job, status)``. With ``monitor=True`` the job is the object fetched
+        by the last poll, so ``job.status`` matches ``status``; otherwise the
+        job is the freshly submitted one and ``status`` is ``None``.
+
+    Raises:
+        FileNotFoundError: If ``draft_train_data_path`` does not exist.
+    """
+    # Fine-tuning the draft model in speculative decoding makes its predictions closer to the target model, increasing token acceptance
+    # and reducing rollbacks. This alignment improves decoding speed and efficiency while maintaining output quality. It also enables
+    # better performance for domain-specific tasks by adapting the draft model to relevant data. AzureML provides a prebuilt pipeline for this fine-tuning process.
+    print("\n" + "=" * 60)
+    print("🎯 STARTING DRAFT MODEL PIPELINE")
+    print("=" * 60 + "\n")
+
+    # Verify training data first so a bad path fails before anything is written
+    if not os.path.exists(draft_train_data_path):
+        raise FileNotFoundError(
+            f"Draft model training data not found: {draft_train_data_path}"
+        )
+
+    # Create draft model config
+    draft_model_config = create_draft_model_config()
+
+    config_dir = "./draft_config"
+    os.makedirs(config_dir, exist_ok=True)
+    draft_config_path = os.path.join(config_dir, "draft_model_config.json")
+
+    with open(draft_config_path, "w") as f:
+        json.dump(draft_model_config, f, indent=4)
+    print(f"Draft model config saved: {draft_config_path}")
+    print(f"Draft training data: {draft_train_data_path}")
+
+    # Get component from registry
+    draft_component_name = "eagle3_chat_completion_pipeline"
+    eagle3_comp = registry_ml_client.components.get(
+        name=draft_component_name, label="latest"
+    )
+
+    # Define pipeline
+    @dsl.pipeline
+    def speculative_decoding_draft_pipeline():
+        node = eagle3_comp(
+            mlflow_model_path=Input(
+                type=AssetTypes.MLFLOW_MODEL, path=base_model_mlflow_path
+            ),
+            dataset_train_split=Input(
+                type=AssetTypes.URI_FILE, path=draft_train_data_path
+            ),
+            # The training file doubles as the validation split.
+            dataset_validation_split=Input(
+                type=AssetTypes.URI_FILE, path=draft_train_data_path
+            ),
+            draft_model_config=Input(type=AssetTypes.URI_FILE, path=draft_config_path),
+            compute_model_import=compute_cluster,
+            compute_eagle3_training=compute_cluster,
+            instance_type_model_import="octacpu",
+            instance_type_eagle3_training="octagpu",
+            num_epochs=num_epochs,
+        )
+        return {"output_model": node.outputs.output_model_path}
+
+    # Submit pipeline
+    draft_job = speculative_decoding_draft_pipeline()
+    print("Submitting draft model training pipeline...")
+    draft_job = ml_client.jobs.create_or_update(
+        draft_job, experiment_name="speculative-decoding-draft-model"
+    )
+
+    print(f"Job submitted: {draft_job.name}")
+    print(f"📊 Studio URL: {draft_job.studio_url}")
+
+    # Monitor if requested
+    if monitor:
+        # Keep the refreshed job so the returned object reflects the final state.
+        draft_job, status = monitor_run(ml_client, draft_job, poll_interval=60)
+        return draft_job, status
+
+    return draft_job, None
+
+
+def prepare_combined_model_for_deployment(
+    ml_client: MLClient,
+    registry_ml_client: MLClient,
+    draft_job_name: str,
+    base_model_hf_id="nvidia/Llama-3.1-8B-Instruct-FP8",
+    model_name="grpo-speculative-decoding",
+    force=False,
+):
+    """Assemble the base and draft models locally and register them together.
+
+    The draft model files are taken from the ``output_model`` output of
+    ``draft_job_name`` and placed in ``./models/draft``; the base model is
+    fetched from Hugging Face into ``./models/base``. Both are then registered
+    through ``DraftModelPipeline.upload_combined_model``.
+
+    Args:
+        ml_client: Client used to download the job output and register the model.
+        registry_ml_client: Client handed to ``DraftModelPipeline``.
+        draft_job_name: Name of the completed draft-model training job.
+        base_model_hf_id: Hugging Face repo id of the base model.
+        model_name: Name prefix of the registered combined model.
+        force: When true, download the draft and base models again even if
+            local copies exist.
+
+    Returns:
+        The registered combined model.
+    """
+    # A draft model deployment requires both the draft model and the base model.
+    # The sglang engine uses the draft model to generate speculative tokens, while the base model
+    # verifies and finalizes the output. This function prepares both models for deployment.
+    print("Preparing combined model for deployment...")
+
+    draft_pipeline = DraftModelPipeline(
+        ml_client=ml_client, registry_ml_client=registry_ml_client
+    )
+
+    # Define paths
+    draft_model_dir = "./models/draft"
+    base_model_dir = "./models/base"
+
+    temp_download_dir = "./models/draft_temp"
+    temp_path = Path(temp_download_dir)
+    draft_path = Path(draft_model_dir)
+    required_files = ["config.json", "model.safetensors", "training_state.pt"]
+
+    # Download the job output unless a draft model (or an earlier, not yet
+    # processed download) is already available locally.
+    draft_present = draft_path.is_dir() and any(draft_path.iterdir())
+    if force or not (draft_present or temp_path.exists()):
+        print("\nDownloading draft model...")
+        ml_client.jobs.download(
+            name=draft_job_name,
+            output_name="output_model",
+            download_path=temp_download_dir,
+            all=True,
+        )
+
+    if temp_path.exists():
+        # The destination must exist before files can be moved into it.
+        os.makedirs(draft_model_dir, exist_ok=True)
+
+        for file_pattern in required_files:
+            files_found = list(temp_path.rglob(file_pattern))
+            if files_found:
+                src_path = files_found[0]  # Take the first match
+                dst_path = draft_path / file_pattern
+                shutil.move(str(src_path), str(dst_path))
+                print(f"Moved {file_pattern}")
+            else:
+                print(f"File not found: {file_pattern}")
+
+        # Clean up temporary directory
+        shutil.rmtree(temp_download_dir)
+        print("Cleaned up temporary directory")
+    else:
+        print(f"Draft model already exists: {draft_model_dir}")
+
+    # Download base model from HuggingFace
+    if force or not os.path.exists(base_model_dir):
+        print("\nDownloading base model...")
+        snapshot_download(repo_id=base_model_hf_id, local_dir=base_model_dir)
+        print(f"Base model downloaded to: {base_model_dir}")
+    else:
+        print(f"Base model already exists: {base_model_dir}")
+
+    # Upload combined model
+    combined_model = draft_pipeline.upload_combined_model(
+        base_model_dir=base_model_dir,
+        draft_model_dir=draft_model_dir,
+        model_name=model_name,
+    )
+
+    print(f"\nCombined model ready for deployment: {combined_model.name}")
+    return combined_model
+
+
+def deploy_speculative_decoding_endpoint(
+    ml_client: MLClient,
+    combined_model,
+    instance_type,  # In kubernetes we can be granular upto the gpu level and leave the rest of the node unused
+    compute_name,  # Compute argument for KubernetesOnlineEndpoint
+):
+    """Deploy the combined base + draft model to a Kubernetes online endpoint.
+
+    Looks up the latest ``speculative-decoding-env`` environment, builds the
+    container environment variables, and hands everything to
+    ``create_kubernetes_deployment``.
+
+    Args:
+        ml_client: Client used to fetch the environment and create the deployment.
+        combined_model: Registered model whose ``id`` is deployed.
+        instance_type: Instance type forwarded to the deployment.
+        compute_name: Compute target forwarded to the deployment.
+
+    Returns:
+        The endpoint name returned by ``create_kubernetes_deployment``.
+
+    Raises:
+        ValueError: If the environment or its id is ``None``.
+    """
+    print("Deploying speculative decoding endpoint")
+
+    serving_env = ml_client.environments.get("speculative-decoding-env", label="latest")
+    if serving_env is None or serving_env.id is None:
+        raise ValueError("Speculative decoding environment not found in registry")
+
+    # Directory under which the model folders are addressed inside the container
+    mount_root = "/var/model-mount"
+
+    # Environment variables configure the serving engine and model paths for the container
+    container_env = {
+        "SPECULATIVE_DECODING_MODE": "true",  # Used sglang framework for inference
+        "BASE_MODEL": f"{mount_root}/models/base",  # Path for base model
+        "DRAFT_MODEL": f"{mount_root}/models/draft",  # Path for draft model
+        "NUM_SPECULATIVE_TOKENS": "5",
+        "SERVING_ENGINE": "sglang",  # the serving engine to use
+    }
+
+    deployed_endpoint = create_kubernetes_deployment(
+        ml_client=ml_client,
+        model_asset_id=combined_model.id,
+        environment_asset_id=serving_env.id,
+        instance_type=instance_type,
+        compute_name=compute_name,
+        endpoint_name="spec-dec-grpo",
+        endpoint_description="Speculative decoding endpoint with GRPO fine-tuned base model",
+        endpoint_tags={"model_type": "speculative_decoding", "algorithm": "grpo"},
+        deployment_name="speculative-deployment",
+        deployment_env_vars=container_env,
+    )
+
+    print(f"Speculative decoding endpoint deployed: {deployed_endpoint}")
+    return deployed_endpoint
diff --git a/sdk/python/foundation-models/system/reinforcement-learning/scripts/utils.py b/sdk/python/foundation-models/system/reinforcement-learning/scripts/utils.py
new file mode 100644
index 000000000..e999f379e
--- /dev/null
+++ b/sdk/python/foundation-models/system/reinforcement-learning/scripts/utils.py
@@ -0,0 +1,33 @@
+import mlflow
+from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
+from azure.ai.ml import MLClient
+
+
+def setup_workspace(config_path="./config.json", registry_name="test_centralus"):
+    """Create the workspace and registry clients and point MLflow at the workspace.
+
+    Authentication tries ``DefaultAzureCredential`` first and falls back to an
+    interactive browser login when no token can be obtained. The two clients
+    are also stored in the module-level globals ``ml_client`` and
+    ``registry_ml_client``.
+
+    Args:
+        config_path: Path to the workspace ``config.json``.
+        registry_name: Name of the registry the second client connects to.
+
+    Returns:
+        ``(ml_client, registry_ml_client)``.
+
+    Raises:
+        ValueError: If the workspace lookup returns ``None`` or the workspace
+            has no MLflow tracking URI.
+    """
+    global ml_client, registry_ml_client
+    try:
+        credential = DefaultAzureCredential()
+        # Requesting a token up front surfaces a missing login here.
+        credential.get_token("https://management.azure.com/.default")
+    except Exception:
+        # Deliberate fallback: any failure above switches to interactive login.
+        credential = InteractiveBrowserCredential()
+
+    ml_client = MLClient.from_config(credential=credential, path=config_path)
+
+    # A single lookup both verifies access and supplies the tracking URI.
+    ws = ml_client.workspaces.get(ml_client.workspace_name)
+    registry_ml_client = MLClient(credential, registry_name=registry_name)
+
+    if ws is None:
+        raise ValueError(f"Workspace {ml_client.workspace_name} not found.")
+
+    mlflow_tracking_uri = ws.mlflow_tracking_uri
+    if mlflow_tracking_uri is None:
+        raise ValueError("MLflow tracking URI is not set for the workspace.")
+
+    # set mlflow tracking uri for workspace
+    mlflow.set_tracking_uri(mlflow_tracking_uri)
+
+    print("Workspace setup complete, connected")
+    return ml_client, registry_ml_client