diff --git a/.github/workflows/cuda12.8-whl-release.yml b/.github/workflows/cuda12.8_whl_release.yml similarity index 100% rename from .github/workflows/cuda12.8-whl-release.yml rename to .github/workflows/cuda12.8_whl_release.yml diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit_test.yml similarity index 100% rename from .github/workflows/unit-test.yml rename to .github/workflows/unit_test.yml diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows_x64_gpu.yml similarity index 100% rename from .github/workflows/windows-x64-gpu.yml rename to .github/workflows/windows_x64_gpu.yml diff --git a/README.md b/README.md index f903b837b1..a9c57ac665 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
  • Qwen2-VL (2B, 7B, 72B)
  • Qwen2.5-VL (3B, 7B, 72B)
  • Qwen3-VL (2B - 235B)
  • - Qwen3.5
  • + Qwen3.5 (27B - 397B)
  • DeepSeek-VL (7B)
  • DeepSeek-VL2 (3B, 16B, 27B)
  • InternVL-Chat (v1.1-v1.5)
  • @@ -228,7 +228,7 @@ The default prebuilt package is compiled on **CUDA 12** since v0.3.0. For the GeForce RTX 50 series, please install the LMDeploy prebuilt package complied with **CUDA 12.8** ```shell -export LMDEPLOY_VERSION=0.12.1 +export LMDEPLOY_VERSION=0.12.2 export PYTHON_VERSION=310 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128 ``` diff --git a/README_ja.md b/README_ja.md index 8537d4085c..ad6bf24361 100644 --- a/README_ja.md +++ b/README_ja.md @@ -155,7 +155,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Qwen2-VL (2B, 7B, 72B)
  • Qwen2.5-VL (3B, 7B, 72B)
  • Qwen3-VL (2B - 235B)
  • - Qwen3.5
  • + Qwen3.5 (27B - 397B)
  • DeepSeek-VL (7B)
  • DeepSeek-VL2 (3B, 16B, 27B)
  • InternVL-Chat (v1.1-v1.5)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 4c2dd759ba..bf23cb8509 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -178,7 +178,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Qwen2-VL (2B, 7B, 72B)
  • Qwen2.5-VL (3B, 7B, 72B)
  • Qwen3-VL (2B - 235B)
  • - Qwen3.5
  • + Qwen3.5 (27B - 397B)
  • DeepSeek-VL (7B)
  • DeepSeek-VL2 (3B, 16B, 27B)
  • InternVL-Chat (v1.1-v1.5)
  • @@ -230,7 +230,7 @@ pip install lmdeploy 若使用 GeForce RTX 50 系列显卡,请安装基于 **CUDA 12.8** 编译的 LMDeploy 预编译包。 ```shell -export LMDEPLOY_VERSION=0.12.1 +export LMDEPLOY_VERSION=0.12.2 export PYTHON_VERSION=310 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128 ``` diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index 6503cc70f5..8ded44555d 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.12.1 +export LMDEPLOY_VERSION=0.12.2 export PYTHON_VERSION=310 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/en/quantization/llm_compressor.md b/docs/en/quantization/llm_compressor.md index 7522f8c5d7..2b6ab9bfc4 100644 --- a/docs/en/quantization/llm_compressor.md +++ b/docs/en/quantization/llm_compressor.md @@ -44,7 +44,7 @@ conda create -n lmdeploy python=3.10 -y conda activate lmdeploy # Install llm-compressor -pip install llm-compressor +pip install llmcompressor # Clone lmdeploy source code and run the quantization example git clone https://github.com/InternLM/lmdeploy diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index 0ce854a24a..988252b028 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -23,7 +23,7 @@ pip install lmdeploy 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: ```shell 
-export LMDEPLOY_VERSION=0.12.1 +export LMDEPLOY_VERSION=0.12.2 export PYTHON_VERSION=310 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/docs/zh_cn/quantization/llm_compressor.md b/docs/zh_cn/quantization/llm_compressor.md index 10a90b4538..8fdcbba6a4 100644 --- a/docs/zh_cn/quantization/llm_compressor.md +++ b/docs/zh_cn/quantization/llm_compressor.md @@ -42,7 +42,7 @@ conda create -n lmdeploy python=3.10 -y conda activate lmdeploy # 安装 llm-compressor -pip install llm-compressor +pip install llmcompressor # 下载 lmdeploy 源码,运行量化用用例 git clone https://github.com/InternLM/lmdeploy diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py index 1106f20c9c..d0b6a5e2d6 100644 --- a/lmdeploy/pytorch/engine/engine_loop.py +++ b/lmdeploy/pytorch/engine/engine_loop.py @@ -146,8 +146,7 @@ def _log_resps(outputs: List[InferOutput]): if logger.level <= logging.DEBUG: session_ids = [out.session_id for out in outputs] logger.debug(f'Response sessions: {session_ids}') - elif logger.level <= logging.INFO: - logger.info(f'Response: num_outputs={len(outputs)}.') + logger.debug(f'Response: num_outputs={len(outputs)}.') def _send_resp(self, out: InferOutput): """Send response.""" diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index 2a68483ad4..3adce23d43 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -268,7 +268,7 @@ async def safe_run(self, handle, session, **kwargs): metrics_processor.increase_api_routed_requests() yield generator except (Exception, asyncio.CancelledError, GeneratorExit) as e: # noqa - logger.error(f'[safe_run] session {session.session_id} exception caught: {type(e).__name__} {e}') + logger.exception(f'[safe_run] session {session.session_id} exception caught: {e}') await session.async_abort() if self.backend == 'pytorch': await handle.async_end(session.session_id) diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 2ea599e687..93f636e27e 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -99,8 +99,6 @@ async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]: def _inner_call(i, in_messages, out_messages): role = in_messages[i]['role'] content = in_messages[i]['content'] - assert role in ['system', 'user', 'assistant'], \ - f'unsupported role "{role}"' if role != 'user' or isinstance(content, str): # the content is a user's prompt or an assistant's prompt, # returning it directly diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 54a2e46ee5..6c412114c3 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.12.1' +__version__ = '0.12.2' short_version = __version__ diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 594fa78b6b..f0e9e4ae40 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -528,7 +528,7 @@ auto SequenceManager::Materialize(Sequences sequences, // release preempted blocks -> cached if (!schedule.victims.empty()) { - TM_LOG_WARNING("[SeqMgr] #victim: %d", (int)schedule.victims.size()); + TM_LOG_INFO("[SeqMgr] #victim: %d", (int)schedule.victims.size()); for (const auto& p : schedule.victims) { UpdateAndSetUnlock(*p); }