19 changes: 10 additions & 9 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -521,8 +521,10 @@ async def chat_completion_full_generator(
if data["finished"]:
num_choices -= 1
choice = await self._create_chat_completion_choice(
- data=data,
+ output=output,
+ index=idx,
request=request,
+ previous_num_tokens=previous_num_tokens[idx],
prompt_token_ids=prompt_token_ids,
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
@@ -557,18 +559,17 @@ async def chat_completion_full_generator(

async def _create_chat_completion_choice(
self,
- data: dict,
+ output: dict,
+ index: int,
request: ChatCompletionRequest,
+ previous_num_tokens: int,
prompt_token_ids: list,
prompt_tokens: str,
completion_token_ids: list,
num_cached_tokens: list,
logprob_contents: list,
response_processor: ChatResponseProcessor,
) -> ChatCompletionResponseChoice:
- idx = int(data["request_id"].split("_")[-1])
- output = data["outputs"]
- previous_num_tokens = len(data["outputs"]["token_ids"])

if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"):
work_process_metrics.e2e_request_latency.observe(
@@ -589,12 +590,12 @@ async def _create_chat_completion_choice(
message.content = output["text"]

logprobs_full_res = None
- if logprob_contents[idx]:
- logprobs_full_res = LogProbs(content=logprob_contents[idx])
+ if logprob_contents[index]:
+ logprobs_full_res = LogProbs(content=logprob_contents[index])

has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
max_tokens = request.max_completion_tokens or request.max_tokens
- num_cached_tokens[idx] = output.get("num_cached_tokens", 0)
+ num_cached_tokens[index] = output.get("num_cached_tokens", 0)

finish_reason = "stop"
if has_no_token_limit or previous_num_tokens != max_tokens:
@@ -607,7 +608,7 @@ async def _create_chat_completion_choice(
finish_reason = "recover_stop"

return ChatCompletionResponseChoice(
- index=idx,
+ index=index,
message=message,
logprobs=logprobs_full_res,
finish_reason=finish_reason,
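
The refactor makes _create_chat_completion_choice take output, index, and previous_num_tokens as explicit arguments instead of re-deriving them from data inside the helper, and indexes logprob_contents and num_cached_tokens with the passed-in index. Below is a minimal sketch of the finish_reason selection implied by these hunks; the "length" branch is reconstructed as an assumption (it is collapsed in this view but matches what the new tests expect), the "recover_stop" path is omitted because its trigger is not visible, and the function name is illustrative.

# Sketch only: reconstructs the selection logic implied by the hunks above and by the new tests.
def resolve_finish_reason(request, previous_num_tokens: int) -> str:
    has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
    # max_completion_tokens takes precedence when both limits are set
    max_tokens = request.max_completion_tokens or request.max_tokens
    if has_no_token_limit or previous_num_tokens != max_tokens:
        return "stop"  # generation ended before any cap was reached
    return "length"    # assumed: exactly max_tokens were generated, i.e. the output was truncated
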
146 changes: 146 additions & 0 deletions tests/e2e/test_EB_Lite_serving.py
@@ -287,6 +287,69 @@ def test_non_streaming_chat(openai_client):
assert hasattr(response.choices[0].message, "content")


def test_non_streaming_chat_finish_reason(openai_client):
"""
Test that non-streaming chat completions report finish_reason "length" when max_tokens or max_completion_tokens caps the output
"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=False,
)

assert hasattr(response, "choices")
assert response.choices[0].finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=False,
)

assert hasattr(response, "choices")
assert response.choices[0].finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=False,
n=2,
)
assert hasattr(response, "choices")
for choice in response.choices:
assert choice.finish_reason == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=False,
n=2,
)
assert hasattr(response, "choices")
for choice in response.choices:
assert choice.finish_reason == "length"
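
The four requests in test_non_streaming_chat_finish_reason differ only in which token-limit field is set and in the value of n. A hypothetical parametrized variant (not part of this PR; it reuses the same openai_client fixture and "default" model) would cover the same 2x2 matrix with one body:

import pytest

@pytest.mark.parametrize("n", [1, 2])
@pytest.mark.parametrize("limit_kwargs", [{"max_tokens": 5}, {"max_completion_tokens": 5}])
def test_non_streaming_finish_reason_param(openai_client, limit_kwargs, n):
    # Each combination caps generation at 5 tokens, so every choice should stop on "length".
    response = openai_client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": "List 3 countries and their capitals."},
        ],
        temperature=1,
        stream=False,
        n=n,
        **limit_kwargs,
    )
    assert hasattr(response, "choices")
    for choice in response.choices:
        assert choice.finish_reason == "length"
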


# Streaming test
def test_streaming_chat(openai_client, capsys):
"""
@@ -1281,6 +1344,89 @@ def test_streaming_completion_with_bad_words(openai_client, capsys):
assert not any(ids in output_ids_2 for ids in bad_token_ids)


def test_streaming_chat_finish_reason(openai_client):
"""
Test that streaming chat completions report finish_reason "length" when max_tokens or max_completion_tokens caps the output
"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=True,
)

for chunk in response:
last_token = chunk.choices[0].finish_reason
assert last_token == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=True,
)

for chunk in response:
last_token = chunk.choices[0].finish_reason
assert last_token == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_completion_tokens=5,
stream=True,
n=2,
)
finish_reason_1 = ""
finish_reason_2 = ""

for chunk in response:
last_token = chunk.choices[0].finish_reason
if last_token:
if chunk.choices[0].index == 0:
finish_reason_1 = last_token
else:
finish_reason_2 = last_token
assert finish_reason_1 == "length"
assert finish_reason_2 == "length"

response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=1,
max_tokens=5,
stream=True,
n=2,
)
finish_reason_1 = ""
finish_reason_2 = ""

for chunk in response:
last_token = chunk.choices[0].finish_reason
if last_token:
if chunk.choices[0].index == 0:
finish_reason_1 = last_token
else:
finish_reason_2 = last_token
assert finish_reason_1 == "length"
assert finish_reason_2 == "length"
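
The two n=2 streaming cases above track each choice with its own variable. A dict keyed by choice.index (a hypothetical helper, not in the PR) collects the same information for any n, assuming, as the loops above do, that each streamed chunk carries exactly one choice:

def collect_finish_reasons(stream) -> dict:
    """Map each choice index to the finish_reason reported in its final chunk."""
    reasons = {}
    for chunk in stream:
        choice = chunk.choices[0]
        if choice.finish_reason:
            reasons[choice.index] = choice.finish_reason
    return reasons

# Usage in the n=2 cases above:
#     reasons = collect_finish_reasons(response)
#     assert reasons == {0: "length", 1: "length"}
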


def test_profile_reset_block_num():
"""测试profile reset_block_num功能,与baseline diff不能超过5%"""
log_file = "./log/config.log"
9 changes: 7 additions & 2 deletions tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -391,6 +391,7 @@ async def test_create_chat_completion_choice(self):
"raw_prediction": "raw_answer_0",
},
"finished": True,
"previous_num_tokens": 2,
},
"mock_request": ChatCompletionRequest(
model="test", messages=[], return_token_ids=True, max_tokens=10, n=2
@@ -417,6 +418,7 @@ async def test_create_chat_completion_choice(self):
"raw_prediction": None,
},
"finished": True,
"previous_num_tokens": 1,
},
"mock_request": ChatCompletionRequest(
model="test", messages=[], return_token_ids=True, max_tokens=5, n=2
@@ -435,16 +437,18 @@ async def test_create_chat_completion_choice(self):

prompt_token_ids = [1, 2]
prompt_tokens = "test_prompt"
- logprob_contents = [[], []]
+ logprob_contents = [[{"token": "hello", "logprob": 0.1}], [{"token": "hello", "logprob": 0.1}]]
mock_response_processor = Mock()
mock_response_processor.enable_multimodal_content.return_value = False
completion_token_ids = [[], []]
num_cached_tokens = [0, 0]

for idx, case in enumerate(test_cases):
actual_choice = await self.chat_serving._create_chat_completion_choice(
- data=case["test_data"],
+ output=case["test_data"]["outputs"],
+ index=idx,
request=case["mock_request"],
+ previous_num_tokens=case["test_data"]["previous_num_tokens"],
prompt_token_ids=prompt_token_ids,
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
@@ -465,6 +469,7 @@ async def test_create_chat_completion_choice(self):

self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
+ assert actual_choice.logprobs is not None


if __name__ == "__main__":