diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index aaa534228f9..618f3219047 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -521,8 +521,10 @@ async def chat_completion_full_generator(
                 if data["finished"]:
                     num_choices -= 1
                     choice = await self._create_chat_completion_choice(
-                        data=data,
+                        output=output,
+                        index=idx,
                         request=request,
+                        previous_num_tokens=previous_num_tokens[idx],
                         prompt_token_ids=prompt_token_ids,
                         prompt_tokens=prompt_tokens,
                         completion_token_ids=completion_token_ids[idx],
@@ -557,8 +559,10 @@ async def chat_completion_full_generator(

     async def _create_chat_completion_choice(
         self,
-        data: dict,
+        output: dict,
+        index: int,
         request: ChatCompletionRequest,
+        previous_num_tokens: int,
         prompt_token_ids: list,
         prompt_tokens: str,
         completion_token_ids: list,
@@ -566,9 +570,6 @@
         logprob_contents: list,
         response_processor: ChatResponseProcessor,
     ) -> ChatCompletionResponseChoice:
-        idx = int(data["request_id"].split("_")[-1])
-        output = data["outputs"]
-        previous_num_tokens = len(data["outputs"]["token_ids"])

         if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"):
             work_process_metrics.e2e_request_latency.observe(
@@ -589,12 +590,12 @@
             message.content = output["text"]

         logprobs_full_res = None
-        if logprob_contents[idx]:
-            logprobs_full_res = LogProbs(content=logprob_contents[idx])
+        if logprob_contents[index]:
+            logprobs_full_res = LogProbs(content=logprob_contents[index])

         has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
         max_tokens = request.max_completion_tokens or request.max_tokens
-        num_cached_tokens[idx] = output.get("num_cached_tokens", 0)
+        num_cached_tokens[index] = output.get("num_cached_tokens", 0)
         finish_reason = "stop"
         if has_no_token_limit or previous_num_tokens != max_tokens:
@@ -607,7 +608,7 @@
                 finish_reason = "recover_stop"

         return ChatCompletionResponseChoice(
-            index=idx,
+            index=index,
             message=message,
             logprobs=logprobs_full_res,
             finish_reason=finish_reason,
diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py
index 4f07e817f0e..ef489ff4c95 100644
--- a/tests/e2e/test_EB_Lite_serving.py
+++ b/tests/e2e/test_EB_Lite_serving.py
@@ -287,6 +287,69 @@ def test_non_streaming_chat(openai_client):
     assert hasattr(response.choices[0].message, "content")


+def test_non_streaming_chat_finish_reason(openai_client):
+    """
+    Test that non-streaming chat reports finish_reason "length" when the token limit is reached
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_tokens=5,
+        stream=False,
+    )
+
+    assert hasattr(response, "choices")
+    assert response.choices[0].finish_reason == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_completion_tokens=5,
+        stream=False,
+    )
+
+    assert hasattr(response, "choices")
+    assert response.choices[0].finish_reason == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_tokens=5,
+        stream=False,
+        n=2,
+    )
+    assert hasattr(response, "choices")
+    for choice in response.choices:
+        assert choice.finish_reason == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_completion_tokens=5,
+        stream=False,
+        n=2,
+    )
+    assert hasattr(response, "choices")
+    for choice in response.choices:
+        assert choice.finish_reason == "length"
+
+
 # Streaming test
 def test_streaming_chat(openai_client, capsys):
     """
@@ -1281,6 +1344,89 @@ def test_streaming_completion_with_bad_words(openai_client, capsys):
     assert not any(ids in output_ids_2 for ids in bad_token_ids)


+def test_streaming_chat_finish_reason(openai_client):
+    """
+    Test that streaming chat reports finish_reason "length" when the token limit is reached
+    """
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_tokens=5,
+        stream=True,
+    )
+
+    for chunk in response:
+        last_token = chunk.choices[0].finish_reason
+    assert last_token == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_completion_tokens=5,
+        stream=True,
+    )
+
+    for chunk in response:
+        last_token = chunk.choices[0].finish_reason
+    assert last_token == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_completion_tokens=5,
+        stream=True,
+        n=2,
+    )
+    finish_reason_1 = ""
+    finish_reason_2 = ""
+
+    for chunk in response:
+        last_token = chunk.choices[0].finish_reason
+        if last_token:
+            if chunk.choices[0].index == 0:
+                finish_reason_1 = last_token
+            else:
+                finish_reason_2 = last_token
+    assert finish_reason_1 == "length"
+    assert finish_reason_2 == "length"
+
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant."},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=1,
+        max_tokens=5,
+        stream=True,
+        n=2,
+    )
+    finish_reason_1 = ""
+    finish_reason_2 = ""
+
+    for chunk in response:
+        last_token = chunk.choices[0].finish_reason
+        if last_token:
+            if chunk.choices[0].index == 0:
+                finish_reason_1 = last_token
+            else:
+                finish_reason_2 = last_token
+    assert finish_reason_1 == "length"
+    assert finish_reason_2 == "length"
+
+
 def test_profile_reset_block_num():
     """Test the profile reset_block_num feature; the diff from the baseline must not exceed 5%"""
     log_file = "./log/config.log"
diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py
index 3454c834072..04c1ea6b617 100644
--- a/tests/entrypoints/openai/test_max_streaming_tokens.py
+++ b/tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -391,6 +391,7 @@ async def test_create_chat_completion_choice(self):
                     "raw_prediction": "raw_answer_0",
                 },
                 "finished": True,
+                "previous_num_tokens": 2,
             },
             "mock_request": ChatCompletionRequest(
                 model="test", messages=[], return_token_ids=True, max_tokens=10, n=2
             ),
@@ -417,6 +418,7 @@ async def test_create_chat_completion_choice(self):
                     "raw_prediction": None,
                 },
                 "finished": True,
+                "previous_num_tokens": 1,
             },
             "mock_request": ChatCompletionRequest(
                 model="test", messages=[], return_token_ids=True, max_tokens=5, n=2
             ),
@@ -435,7 +437,7 @@ async def test_create_chat_completion_choice(self):

         prompt_token_ids = [1, 2]
         prompt_tokens = "test_prompt"
-        logprob_contents = [[], []]
+        logprob_contents = [[{"token": "hello", "logprob": 0.1}], [{"token": "hello", "logprob": 0.1}]]
         mock_response_processor = Mock()
         mock_response_processor.enable_multimodal_content.return_value = False
         completion_token_ids = [[], []]
@@ -443,8 +445,10 @@ async def test_create_chat_completion_choice(self):

         for idx, case in enumerate(test_cases):
             actual_choice = await self.chat_serving._create_chat_completion_choice(
-                data=case["test_data"],
+                output=case["test_data"]["outputs"],
+                index=idx,
                 request=case["mock_request"],
+                previous_num_tokens=case["test_data"]["previous_num_tokens"],
                 prompt_token_ids=prompt_token_ids,
                 prompt_tokens=prompt_tokens,
                 completion_token_ids=completion_token_ids[idx],
@@ -465,6 +469,7 @@ async def test_create_chat_completion_choice(self):
             self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
             self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
+            self.assertIsNotNone(actual_choice.logprobs)


 if __name__ == "__main__":
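
For reference, a minimal, self-contained sketch of the finish_reason logic these tests
exercise, distilled from the serving_chat.py hunks above. The helper name
resolve_finish_reason is hypothetical (not part of the codebase), and the tool-call and
"recover_stop" branches visible in the hunks are elided:

    def resolve_finish_reason(previous_num_tokens, max_tokens, max_completion_tokens):
        # Mirrors the diff: max_completion_tokens takes precedence over max_tokens,
        # and "length" is reported only when the accumulated completion length
        # reaches the effective limit; any other completion finishes with "stop".
        has_no_token_limit = max_tokens is None and max_completion_tokens is None
        limit = max_completion_tokens or max_tokens
        if has_no_token_limit or previous_num_tokens != limit:
            return "stop"
        return "length"

    assert resolve_finish_reason(5, 5, None) == "length"    # max_tokens hit
    assert resolve_finish_reason(3, 5, None) == "stop"      # stopped early
    assert resolve_finish_reason(5, None, 5) == "length"    # max_completion_tokens hit
    assert resolve_finish_reason(7, None, None) == "stop"   # no limit configured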