diff --git a/.github/workflows/cuda12.8-whl-release.yml b/.github/workflows/cuda12.8_whl_release.yml
similarity index 100%
rename from .github/workflows/cuda12.8-whl-release.yml
rename to .github/workflows/cuda12.8_whl_release.yml
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit_test.yml
similarity index 100%
rename from .github/workflows/unit-test.yml
rename to .github/workflows/unit_test.yml
diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows_x64_gpu.yml
similarity index 100%
rename from .github/workflows/windows-x64-gpu.yml
rename to .github/workflows/windows_x64_gpu.yml
diff --git a/README.md b/README.md
index f903b837b1..a9c57ac665 100644
--- a/README.md
+++ b/README.md
@@ -176,7 +176,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
Qwen2-VL (2B, 7B, 72B)
Qwen2.5-VL (3B, 7B, 72B)
Qwen3-VL (2B - 235B)
- Qwen3.5
+ Qwen3.5 (27B - 397B)
DeepSeek-VL (7B)
DeepSeek-VL2 (3B, 16B, 27B)
InternVL-Chat (v1.1-v1.5)
@@ -228,7 +228,7 @@ The default prebuilt package is compiled on **CUDA 12** since v0.3.0.
For the GeForce RTX 50 series, please install the LMDeploy prebuilt package compiled with **CUDA 12.8**
```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128
```
diff --git a/README_ja.md b/README_ja.md
index 8537d4085c..ad6bf24361 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -155,7 +155,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
Qwen2-VL (2B, 7B, 72B)
Qwen2.5-VL (3B, 7B, 72B)
Qwen3-VL (2B - 235B)
- Qwen3.5
+ Qwen3.5 (27B - 397B)
DeepSeek-VL (7B)
DeepSeek-VL2 (3B, 16B, 27B)
InternVL-Chat (v1.1-v1.5)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 4c2dd759ba..bf23cb8509 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -178,7 +178,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
Qwen2-VL (2B, 7B, 72B)
Qwen2.5-VL (3B, 7B, 72B)
Qwen3-VL (2B - 235B)
- Qwen3.5
+ Qwen3.5 (27B - 397B)
DeepSeek-VL (7B)
DeepSeek-VL2 (3B, 16B, 27B)
InternVL-Chat (v1.1-v1.5)
@@ -230,7 +230,7 @@ pip install lmdeploy
若使用 GeForce RTX 50 系列显卡,请安装基于 **CUDA 12.8** 编译的 LMDeploy 预编译包。
```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128
```
diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md
index 6503cc70f5..8ded44555d 100644
--- a/docs/en/get_started/installation.md
+++ b/docs/en/get_started/installation.md
@@ -23,7 +23,7 @@ pip install lmdeploy
The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by:
```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
diff --git a/docs/en/quantization/llm_compressor.md b/docs/en/quantization/llm_compressor.md
index 7522f8c5d7..2b6ab9bfc4 100644
--- a/docs/en/quantization/llm_compressor.md
+++ b/docs/en/quantization/llm_compressor.md
@@ -44,7 +44,7 @@ conda create -n lmdeploy python=3.10 -y
conda activate lmdeploy
# Install llm-compressor
-pip install llm-compressor
+pip install llmcompressor
# Clone lmdeploy source code and run the quantization example
git clone https://github.com/InternLM/lmdeploy
diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md
index 0ce854a24a..988252b028 100644
--- a/docs/zh_cn/get_started/installation.md
+++ b/docs/zh_cn/get_started/installation.md
@@ -23,7 +23,7 @@ pip install lmdeploy
默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy:
```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
diff --git a/docs/zh_cn/quantization/llm_compressor.md b/docs/zh_cn/quantization/llm_compressor.md
index 10a90b4538..8fdcbba6a4 100644
--- a/docs/zh_cn/quantization/llm_compressor.md
+++ b/docs/zh_cn/quantization/llm_compressor.md
@@ -42,7 +42,7 @@ conda create -n lmdeploy python=3.10 -y
conda activate lmdeploy
# 安装 llm-compressor
-pip install llm-compressor
+pip install llmcompressor
# 下载 lmdeploy 源码,运行量化用例
git clone https://github.com/InternLM/lmdeploy
diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py
index 1106f20c9c..d0b6a5e2d6 100644
--- a/lmdeploy/pytorch/engine/engine_loop.py
+++ b/lmdeploy/pytorch/engine/engine_loop.py
@@ -146,8 +146,7 @@ def _log_resps(outputs: List[InferOutput]):
if logger.level <= logging.DEBUG:
session_ids = [out.session_id for out in outputs]
logger.debug(f'Response sessions: {session_ids}')
- elif logger.level <= logging.INFO:
- logger.info(f'Response: num_outputs={len(outputs)}.')
+ logger.debug(f'Response: num_outputs={len(outputs)}.')
def _send_resp(self, out: InferOutput):
"""Send response."""
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 2a68483ad4..3adce23d43 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -268,7 +268,7 @@ async def safe_run(self, handle, session, **kwargs):
metrics_processor.increase_api_routed_requests()
yield generator
except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa
- logger.error(f'[safe_run] session {session.session_id} exception caught: {type(e).__name__} {e}')
+ logger.exception(f'[safe_run] session {session.session_id} exception caught: {e}')
await session.async_abort()
if self.backend == 'pytorch':
await handle.async_end(session.session_id)
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 2ea599e687..93f636e27e 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -99,8 +99,6 @@ async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]:
def _inner_call(i, in_messages, out_messages):
role = in_messages[i]['role']
content = in_messages[i]['content']
- assert role in ['system', 'user', 'assistant'], \
- f'unsupported role "{role}"'
if role != 'user' or isinstance(content, str):
# the content is a user's prompt or an assistant's prompt,
# returning it directly
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 54a2e46ee5..6c412114c3 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
-__version__ = '0.12.1'
+__version__ = '0.12.2'
short_version = __version__
diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc
index 594fa78b6b..f0e9e4ae40 100644
--- a/src/turbomind/models/llama/SequenceManager.cc
+++ b/src/turbomind/models/llama/SequenceManager.cc
@@ -528,7 +528,7 @@ auto SequenceManager::Materialize(Sequences sequences,
// release preempted blocks -> cached
if (!schedule.victims.empty()) {
- TM_LOG_WARNING("[SeqMgr] #victim: %d", (int)schedule.victims.size());
+ TM_LOG_INFO("[SeqMgr] #victim: %d", (int)schedule.victims.size());
for (const auto& p : schedule.victims) {
UpdateAndSetUnlock(*p);
}