11 changes: 11 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
@@ -66,6 +66,16 @@ class PromptTokenUsageInfo(BaseModel):
"""

cached_tokens: Optional[int] = None
image_tokens: Optional[int] = None
video_tokens: Optional[int] = None


class CompletionTokenUsageInfo(BaseModel):
"""
Completion-related token usage info.
"""

image_tokens: Optional[int] = None


class UsageInfo(BaseModel):
@@ -77,6 +87,7 @@ class UsageInfo(BaseModel):
total_tokens: int = 0
completion_tokens: Optional[int] = 0
prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
completion_tokens_details: Optional[CompletionTokenUsageInfo] = None


class ModelPermission(BaseModel):
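The new `completion_tokens_details` field mirrors the nesting of `prompt_tokens_details`. A minimal, self-contained sketch of how the extended usage block serializes (the real models live in `fastdeploy/entrypoints/openai/protocol.py`; field values here are invented for illustration):

```python
from typing import Optional

from pydantic import BaseModel


class PromptTokenUsageInfo(BaseModel):
    cached_tokens: Optional[int] = None
    image_tokens: Optional[int] = None
    video_tokens: Optional[int] = None


class CompletionTokenUsageInfo(BaseModel):
    image_tokens: Optional[int] = None


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
    completion_tokens_details: Optional[CompletionTokenUsageInfo] = None


# Invented values, only to show the nested JSON shape clients will see.
usage = UsageInfo(
    prompt_tokens=120,
    completion_tokens=48,
    total_tokens=168,
    prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=64, image_tokens=32),
    completion_tokens_details=CompletionTokenUsageInfo(image_tokens=16),
)
print(usage.model_dump_json(exclude_none=True))
```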
5 changes: 5 additions & 0 deletions fastdeploy/entrypoints/openai/response_processors.py
@@ -16,6 +16,7 @@

from typing import Any, List, Optional

from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
from fastdeploy.utils import api_server_logger

@@ -103,6 +104,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
image_output = self._end_image_code_request_output
image_output["outputs"]["multipart"] = [image]
image_output["outputs"]["token_ids"] = all_tokens
image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
yield image_output

self.data_processor.process_response_dict(
@@ -123,6 +125,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
token_ids = request_output["outputs"]["token_ids"]
if token_ids[-1] == self.eos_token_id:
multipart = []
num_image_tokens = 0
for part in self._multipart_buffer:
if part["decode_type"] == 0:
self.data_processor.process_response_dict(
@@ -138,6 +141,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
if self.decoder_client:
req_id = part["request_output"]["request_id"]
all_tokens = part["request_output"]["outputs"]["token_ids"]
num_image_tokens += count_tokens(all_tokens)
image_ret = await self.decoder_client.decode_image(
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
)
@@ -146,4 +150,5 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,

lasrt_request_output = self._multipart_buffer[-1]["request_output"]
lasrt_request_output["outputs"]["multipart"] = multipart
lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
yield lasrt_request_output
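For context, the non-streaming path above buffers multipart outputs and only counts image tokens for non-text parts. A rough, self-contained sketch of that accounting (the `decode_type` convention of 0 = text follows the diff; buffer contents are invented, and the local `count_tokens` is a simplified stand-in for the helper added in `usage_calculator.py`):

```python
# Simplified stand-in for fastdeploy.entrypoints.openai.usage_calculator.count_tokens.
def count_tokens(tokens):
    return sum(count_tokens(t) for t in tokens) if isinstance(tokens, (list, tuple)) else 1


# Invented buffer: one text part (decode_type == 0) and one image part.
multipart_buffer = [
    {"decode_type": 0, "request_output": {"outputs": {"token_ids": [11, 12]}}},
    {"decode_type": 1, "request_output": {"outputs": {"token_ids": [[7, 8], [9]]}}},
]

num_image_tokens = 0
for part in multipart_buffer:
    if part["decode_type"] != 0:  # only image parts count toward image_tokens
        num_image_tokens += count_tokens(part["request_output"]["outputs"]["token_ids"])

print(num_image_tokens)  # 3
```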
13 changes: 13 additions & 0 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -29,6 +29,7 @@
ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse,
ChatMessage,
CompletionTokenUsageInfo,
DeltaMessage,
ErrorInfo,
ErrorResponse,
@@ -187,6 +188,8 @@ async def chat_completion_stream_generator(
first_iteration = True
previous_num_tokens = [0] * num_choices
num_prompt_tokens = 0
num_cached_tokens = 0
num_image_tokens = [0] * num_choices
tool_called = [False] * num_choices
max_streaming_response_tokens = (
request.max_streaming_response_tokens
@@ -318,6 +321,7 @@
output_top_logprobs = output["top_logprobs"]
output_draft_top_logprobs = output["draft_top_logprobs"]
previous_num_tokens[idx] += len(output["token_ids"])
num_image_tokens[idx] += output.get("num_image_tokens") or 0
logprobs_res: Optional[LogProbs] = None
draft_logprobs_res: Optional[LogProbs] = None
if request.logprobs and output_top_logprobs is not None:
@@ -385,6 +389,8 @@
prompt_tokens=num_prompt_tokens,
completion_tokens=previous_num_tokens[idx],
total_tokens=num_prompt_tokens + previous_num_tokens[idx],
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
completion_tokens_details=CompletionTokenUsageInfo(image_tokens=num_image_tokens[idx]),
)
choices.append(choice)

@@ -401,6 +407,8 @@
prompt_tokens=num_prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=num_prompt_tokens + completion_tokens,
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
completion_tokens_details=CompletionTokenUsageInfo(image_tokens=sum(num_image_tokens)),
)
chunk = ChatCompletionStreamResponse(
id=request_id,
@@ -456,6 +464,7 @@ async def chat_completion_full_generator(
draft_logprob_contents = [[] for _ in range(num_choices)]
completion_token_ids = [[] for _ in range(num_choices)]
num_cached_tokens = [0] * num_choices
num_image_tokens = [0] * num_choices
response_processor = ChatResponseProcessor(
data_processor=self.engine_client.data_processor,
enable_mm_output=self.enable_mm_output,
@@ -527,6 +536,7 @@
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
num_cached_tokens=num_cached_tokens,
num_image_tokens=num_image_tokens,
logprob_contents=logprob_contents,
response_processor=response_processor,
)
@@ -543,6 +553,7 @@
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
completion_tokens_details=CompletionTokenUsageInfo(image_tokens=sum(num_image_tokens)),
)
choices = sorted(choices, key=lambda x: x.index)
res = ChatCompletionResponse(
@@ -563,6 +574,7 @@ async def _create_chat_completion_choice(
prompt_tokens: str,
completion_token_ids: list,
num_cached_tokens: list,
num_image_tokens: list,
logprob_contents: list,
response_processor: ChatResponseProcessor,
) -> ChatCompletionResponseChoice:
@@ -595,6 +607,7 @@ async def _create_chat_completion_choice(
has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
max_tokens = request.max_completion_tokens or request.max_tokens
num_cached_tokens[idx] = output.get("num_cached_tokens", 0)
num_image_tokens[idx] = output.get("num_image_tokens", 0)

finish_reason = "stop"
if has_no_token_limit or previous_num_tokens != max_tokens:
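The stream generator now keeps one image-token counter per choice and sums them for the final usage chunk. A hedged sketch of that roll-up (all numbers invented; how `completion_tokens` is derived in the real generator may differ):

```python
# Invented per-choice counters, mirroring the lists initialized in the diff.
num_prompt_tokens = 120
num_cached_tokens = 64
previous_num_tokens = [48, 52]  # generated tokens per choice
num_image_tokens = [16, 0]      # image tokens per choice

completion_tokens = sum(previous_num_tokens)
final_usage = {
    "prompt_tokens": num_prompt_tokens,
    "completion_tokens": completion_tokens,
    "total_tokens": num_prompt_tokens + completion_tokens,
    "prompt_tokens_details": {"cached_tokens": num_cached_tokens},
    "completion_tokens_details": {"image_tokens": sum(num_image_tokens)},
}
print(final_usage)
```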
18 changes: 17 additions & 1 deletion fastdeploy/entrypoints/openai/serving_completion.py
@@ -30,8 +30,10 @@
CompletionResponseChoice,
CompletionResponseStreamChoice,
CompletionStreamResponse,
CompletionTokenUsageInfo,
ErrorInfo,
ErrorResponse,
PromptTokenUsageInfo,
UsageInfo,
)
from fastdeploy.utils import (
@@ -369,6 +371,8 @@ async def completion_stream_generator(
req_id = f"{request_id}_{i}"
dealer.write([b"", req_id.encode("utf-8")])  # send the multiplexed request
output_tokens = [0] * num_choices
num_cache_tokens = [0] * num_choices
num_image_tokens = [0] * num_choices
inference_start_time = [0] * num_choices
first_iteration = [True] * num_choices
tool_called = [False] * num_choices
@@ -457,7 +461,9 @@ async def completion_stream_generator(
draft_logprobs_res = self._create_completion_logprobs(
output_draft_top_logprobs, request.logprobs, 0
)
output_tokens[idx] += 1
output_tokens[idx] += len(output.get("token_ids", [])) or 0
num_cache_tokens[idx] += output.get("num_cache_tokens") or 0
num_image_tokens[idx] += output.get("num_image_tokens") or 0
delta_message = CompletionResponseStreamChoice(
index=idx,
text=output["text"],
@@ -524,6 +530,10 @@
prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
)
+ output_tokens[idx],
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens[idx]),
completion_tokens_details=CompletionTokenUsageInfo(
image_tokens=num_image_tokens[idx]
),
),
)
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
@@ -553,6 +563,8 @@ def request_output_to_completion_response(
choices: List[CompletionResponseChoice] = []
num_prompt_tokens = 0
num_generated_tokens = 0
num_cache_tokens = 0
num_image_tokens = 0

for idx in range(len(final_res_batch)):
final_res = final_res_batch[idx]
@@ -607,12 +619,16 @@
num_generated_tokens += final_res["output_token_ids"]

num_prompt_tokens += len(prompt_token_ids)
num_cache_tokens += output.get("num_cache_tokens") or 0
num_image_tokens += output.get("num_image_tokens") or 0

num_prompt_tokens = num_prompt_tokens // (1 if request.n is None else request.n)
usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens),
completion_tokens_details=CompletionTokenUsageInfo(image_tokens=num_image_tokens),
)
del request

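Note the counting change in the streaming branch: the old code added 1 per response chunk, which undercounts whenever a chunk carries several token ids. A small sketch of the difference (the chunk payload is invented):

```python
# Invented chunk payload with three token ids delivered at once.
output = {"token_ids": [101, 102, 103]}

old_count = 1                                      # previous behaviour: +1 per chunk
new_count = len(output.get("token_ids", [])) or 0  # new behaviour: count the delivered ids

print(old_count, new_count)  # 1 3
```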
33 changes: 33 additions & 0 deletions fastdeploy/entrypoints/openai/usage_calculator.py
@@ -0,0 +1,33 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import numpy as np


def count_tokens(tokens):
"""
Count the number of tokens in a nested list or array structure.
"""
count = 0
stack = [tokens]
while stack:
current = stack.pop()
if isinstance(current, (list, tuple, np.ndarray)):
for item in reversed(current):
stack.append(item)
else:
count += 1
return count
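Assuming the `count_tokens` definition above is in scope, it flattens arbitrarily nested lists, tuples, and numpy arrays and counts the leaf elements; this example (not part of the diff) shows the behavior:

```python
import numpy as np

# Mixed nesting: 1 + 2 + 1 + 3 leaf tokens.
nested = [1, [2, 3], (4, np.array([5, 6, 7]))]
assert count_tokens(nested) == 7
```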
7 changes: 7 additions & 0 deletions tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -388,6 +388,7 @@ async def test_create_chat_completion_choice(self):
"reasoning_content": "Normal reasoning",
"tool_call": None,
"num_cached_tokens": 3,
"num_image_tokens": 2,
"raw_prediction": "raw_answer_0",
},
"finished": True,
@@ -402,6 +403,7 @@ async def test_create_chat_completion_choice(self):
"tool_calls": None,
"raw_prediction": "raw_answer_0",
"num_cached_tokens": 3,
"num_image_tokens": 2,
"finish_reason": "stop",
},
},
@@ -414,6 +416,7 @@
"reasoning_content": None,
"tool_call": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"raw_prediction": None,
},
"finished": True,
@@ -428,6 +431,7 @@
"tool_calls": None,
"raw_prediction": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"finish_reason": "stop",
},
},
@@ -440,6 +444,7 @@
mock_response_processor.enable_multimodal_content.return_value = False
completion_token_ids = [[], []]
num_cached_tokens = [0, 0]
num_image_tokens = [0, 0]

for idx, case in enumerate(test_cases):
actual_choice = await self.chat_serving._create_chat_completion_choice(
@@ -449,6 +454,7 @@
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
num_cached_tokens=num_cached_tokens,
num_image_tokens=num_image_tokens,
logprob_contents=logprob_contents,
response_processor=mock_response_processor,
)
@@ -464,6 +470,7 @@
self.assertEqual(actual_choice.message.completion_token_ids, completion_token_ids[idx])

self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
self.assertEqual(num_image_tokens[expected["index"]], expected["num_image_tokens"])
self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])

