19 changes: 10 additions & 9 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -521,8 +521,10 @@ async def chat_completion_full_generator(
if data["finished"]:
num_choices -= 1
choice = await self._create_chat_completion_choice(
- data=data,
+ output=output,
+ index=idx,
request=request,
+ previous_num_tokens=previous_num_tokens[idx],
prompt_token_ids=prompt_token_ids,
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
@@ -557,18 +559,17 @@ async def chat_completion_full_generator(

async def _create_chat_completion_choice(
self,
- data: dict,
+ output: dict,
+ index: int,
request: ChatCompletionRequest,
+ previous_num_tokens: int,
prompt_token_ids: list,
prompt_tokens: str,
completion_token_ids: list,
num_cached_tokens: list,
logprob_contents: list,
response_processor: ChatResponseProcessor,
) -> ChatCompletionResponseChoice:
- idx = int(data["request_id"].split("_")[-1])
- output = data["outputs"]
- previous_num_tokens = len(data["outputs"]["token_ids"])

if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"):
work_process_metrics.e2e_request_latency.observe(
@@ -589,12 +590,12 @@ async def _create_chat_completion_choice(
message.content = output["text"]

logprobs_full_res = None
- if logprob_contents[idx]:
- logprobs_full_res = LogProbs(content=logprob_contents[idx])
+ if logprob_contents[index]:
+ logprobs_full_res = LogProbs(content=logprob_contents[index])

has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
max_tokens = request.max_completion_tokens or request.max_tokens
- num_cached_tokens[idx] = output.get("num_cached_tokens", 0)
+ num_cached_tokens[index] = output.get("num_cached_tokens", 0)

finish_reason = "stop"
if has_no_token_limit or previous_num_tokens != max_tokens:
@@ -607,7 +608,7 @@ async def _create_chat_completion_choice(
finish_reason = "recover_stop"

return ChatCompletionResponseChoice(
- index=idx,
+ index=index,
message=message,
logprobs=logprobs_full_res,
finish_reason=finish_reason,
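
The refactor makes _create_chat_completion_choice take output, index, and previous_num_tokens as explicit arguments instead of re-deriving them from data inside the helper, and indexes logprob_contents and num_cached_tokens with the passed-in index. Below is a minimal sketch of the finish_reason selection implied by these hunks; the "length" branch is reconstructed as an assumption (it is collapsed in this view but matches what the new tests expect), the "recover_stop" path is omitted because its trigger is not visible, and the function name is illustrative.

# Sketch only: reconstructs the selection logic implied by the hunks above and by the new tests.
def resolve_finish_reason(request, previous_num_tokens: int) -> str:
    has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
    # max_completion_tokens takes precedence when both limits are set
    max_tokens = request.max_completion_tokens or request.max_tokens
    if has_no_token_limit or previous_num_tokens != max_tokens:
        return "stop"  # generation ended before any cap was reached
    return "length"    # assumed: exactly max_tokens were generated, i.e. the output was truncated
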
146 changes: 146 additions & 0 deletions tests/e2e/test_EB_Lite_serving.py
@@ -287,6 +287,69 @@ def test_non_streaming_chat(openai_client):
assert hasattr(response.choices[0].message, "content")


def test_non_streaming_chat_finish_reason(openai_client):
"""
Test that non-streaming chat completions report finish_reason "length" when max_tokens or max_completion_tokens caps the output
"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=False,
)

assert hasattr(response, "choices")
assert response.choices[0].finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=False,
)

assert hasattr(response, "choices")
assert response.choices[0].finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=False,
n=2,
)
assert hasattr(response, "choices")
for choice in response.choices:
assert choice.finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=False,
n=2,
)
assert hasattr(response, "choices")
for choice in response.choices:
assert choice.finish_reason == "length"
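
The four requests in test_non_streaming_chat_finish_reason differ only in which token-limit field is set and in the value of n. A hypothetical parametrized variant (not part of this PR; it reuses the same openai_client fixture and "default" model) would cover the same 2x2 matrix with one body:

import pytest

@pytest.mark.parametrize("n", [1, 2])
@pytest.mark.parametrize("limit_kwargs", [{"max_tokens": 5}, {"max_completion_tokens": 5}])
def test_non_streaming_finish_reason_param(openai_client, limit_kwargs, n):
    # Each combination caps generation at 5 tokens, so every choice should stop on "length".
    response = openai_client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": "List 3 countries and their capitals."},
        ],
        temperature=1,
        stream=False,
        n=n,
        **limit_kwargs,
    )
    assert hasattr(response, "choices")
    for choice in response.choices:
        assert choice.finish_reason == "length"
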


# Streaming test
def test_streaming_chat(openai_client, capsys):
"""
@@ -1281,6 +1344,89 @@ def test_streaming_completion_with_bad_words(openai_client, capsys):
assert not any(ids in output_ids_2 for ids in bad_token_ids)


def test_streaming_chat_finish_reason(openai_client):
"""
Test that streaming chat completions report finish_reason "length" when max_tokens or max_completion_tokens caps the output
"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=True,
)

for chunk in response:
last_token = chunk.choices[0].finish_reason
assert last_token == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=True,
)

for chunk in response:
last_token = chunk.choices[0].finish_reason
assert last_token == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=True,
n=2,
)
finish_reason_1 = ""
finish_reason_2 = ""

for chunk in response:
last_token = chunk.choices[0].finish_reason
if last_token:
if chunk.choices[0].index == 0:
finish_reason_1 = last_token
else:
finish_reason_2 = last_token
assert finish_reason_1 == "length"
assert finish_reason_2 == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=True,
n=2,
)
finish_reason_1 = ""
finish_reason_2 = ""

for chunk in response:
last_token = chunk.choices[0].finish_reason
if last_token:
if chunk.choices[0].index == 0:
finish_reason_1 = last_token
else:
finish_reason_2 = last_token
assert finish_reason_1 == "length"
assert finish_reason_2 == "length"
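
The two n=2 streaming cases above track each choice with its own variable. A dict keyed by choice.index (a hypothetical helper, not in the PR) collects the same information for any n, assuming, as the loops above do, that each streamed chunk carries exactly one choice:

def collect_finish_reasons(stream) -> dict:
    """Map each choice index to the finish_reason reported in its final chunk."""
    reasons = {}
    for chunk in stream:
        choice = chunk.choices[0]
        if choice.finish_reason:
            reasons[choice.index] = choice.finish_reason
    return reasons

# Usage in the n=2 cases above:
#     reasons = collect_finish_reasons(response)
#     assert reasons == {0: "length", 1: "length"}
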


def test_profile_reset_block_num():
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
log_file = "./log/config.log"
9 changes: 7 additions & 2 deletions tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -391,6 +391,7 @@ async def test_create_chat_completion_choice(self):
"raw_prediction": "raw_answer_0",
},
"finished": True,
"previous_num_tokens": 2,
},
"mock_request": ChatCompletionRequest(
model="test", messages=[], return_token_ids=True, max_tokens=10, n=2
@@ -417,6 +418,7 @@ async def test_create_chat_completion_choice(self):
"raw_prediction": None,
},
"finished": True,
"previous_num_tokens": 1,
},
"mock_request": ChatCompletionRequest(
model="test", messages=[], return_token_ids=True, max_tokens=5, n=2
@@ -435,16 +437,18 @@ async def test_create_chat_completion_choice(self):

prompt_token_ids = [1, 2]
prompt_tokens = "test_prompt"
- logprob_contents = [[], []]
+ logprob_contents = [[{"token": "hello", "logprob": 0.1}], [{"token": "hello", "logprob": 0.1}]]
mock_response_processor = Mock()
mock_response_processor.enable_multimodal_content.return_value = False
completion_token_ids = [[], []]
num_cached_tokens = [0, 0]

for idx, case in enumerate(test_cases):
actual_choice = await self.chat_serving._create_chat_completion_choice(
- data=case["test_data"],
+ output=case["test_data"]["outputs"],
+ index=idx,
request=case["mock_request"],
+ previous_num_tokens=case["test_data"]["previous_num_tokens"],
prompt_token_ids=prompt_token_ids,
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
@@ -465,6 +469,7 @@ async def test_create_chat_completion_choice(self):

self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
+ assert actual_choice.logprobs is not None


if __name__ == "__main__":