Merged

53 commits
54db20a
add singleton
lvhan028 Dec 23, 2025
bb7ad10
init
lvhan028 Jan 4, 2026
8af17e4
remove useless code
lvhan028 Jan 4, 2026
fdeb371
merge main
lvhan028 Jan 5, 2026
3c2100b
merge main
lvhan028 Jan 5, 2026
199d8bc
implement Pipeline
lvhan028 Jan 8, 2026
29eef8e
fix when test llm pipeline
lvhan028 Jan 8, 2026
87563dc
fix chat
lvhan028 Jan 8, 2026
5830872
merge main
lvhan028 Jan 8, 2026
4932a28
fix chat
lvhan028 Jan 9, 2026
d9e7380
fix profile pipeline
lvhan028 Jan 9, 2026
ca497b1
fix acquire_inst
lvhan028 Jan 9, 2026
8233746
fix get_ppl
lvhan028 Jan 12, 2026
5aaf383
fix
lvhan028 Jan 12, 2026
a3c721f
use inst.async_end
lvhan028 Jan 12, 2026
b887433
fix session's async_end
lvhan028 Jan 12, 2026
e24f3f2
Merge branch 'main' into refactor-async-engine
lvhan028 Jan 12, 2026
0637b97
fix max_new_tokens
lvhan028 Jan 12, 2026
3fb6bdc
Merge branch 'refactor-async-engine' of https://github.com/lvhan028/l…
lvhan028 Jan 12, 2026
730ac66
rename session_mgr.create to session_mgr.get
lvhan028 Jan 12, 2026
6a740b9
fix step
lvhan028 Jan 13, 2026
e621a00
rollback fused_moe_ep
lvhan028 Jan 13, 2026
6193ff4
improve singleton
lvhan028 Jan 13, 2026
71ed1b3
fix processing multimodal data
lvhan028 Jan 14, 2026
bd6f437
Merge branch 'refactor-async-engine' of https://github.com/lvhan028/l…
lvhan028 Jan 14, 2026
251d4db
fix
lvhan028 Jan 14, 2026
edafe71
fix
lvhan028 Jan 14, 2026
03d8f79
remove useless code from vl_async_engine
lvhan028 Jan 14, 2026
3afc3ad
fix is_single
lvhan028 Jan 14, 2026
2e99627
make managers package including inst_manager and session_manager
lvhan028 Jan 14, 2026
9ab9e9e
make processors package including multimodal_processor
lvhan028 Jan 14, 2026
c12417c
move async_engine and vl_async_engine to serve/core package
lvhan028 Jan 14, 2026
3fe1755
about utils
lvhan028 Jan 14, 2026
96cd73e
docs
lvhan028 Jan 15, 2026
3976029
docs
lvhan028 Jan 15, 2026
1127221
fix BC
lvhan028 Jan 19, 2026
5310c28
mark serve and client APIs are not available
lvhan028 Jan 19, 2026
7ee4265
rename inst to handle
lvhan028 Jan 19, 2026
8a03fdf
enhance comments
lvhan028 Jan 19, 2026
6932a9f
fix according to lzhangzz comment
lvhan028 Jan 19, 2026
4196e5f
fix according to lzhangzz comments
lvhan028 Jan 20, 2026
a1321c7
following python3.10+ type hint spec
lvhan028 Jan 20, 2026
daad15d
Merge branch 'main' into refactor-async-engine
lvhan028 Jan 20, 2026
222afb5
apply_session_id to get_session_id
lvhan028 Jan 20, 2026
41ac0fe
refactor session and sessionmanager
lvhan028 Jan 21, 2026
99eb95c
update api_server
lvhan028 Jan 23, 2026
9487486
Merge branch 'main' into refactor-async-engine
lvhan028 Jan 26, 2026
7457e77
move EventThread from async_engine to pipeline
lvhan028 Jan 27, 2026
a00d1d9
apply lazy import to resolve the readthedocs issue
lvhan028 Jan 27, 2026
33b12f9
Merge branch 'main' into refactor-async-engine
lvhan028 Jan 28, 2026
b50914e
call self._handle.async_end when SafeRunException happens
lvhan028 Jan 28, 2026
beb6b91
Merge branch 'main' into refactor-async-engine
lvhan028 Jan 28, 2026
8ae2ada
fix get_reward_score
lvhan028 Jan 28, 2026
11 changes: 5 additions & 6 deletions docs/en/api/pipeline.rst
@@ -5,12 +5,11 @@ Inference pipeline
Pipeline
--------
.. autofunction:: pipeline

Serving
--------
.. autofunction:: lmdeploy.api.serve
.. autofunction:: lmdeploy.api.client

.. autoclass:: Pipeline
:undoc-members:
:show-inheritance:
:members: __init__, infer, stream_infer, chat, get_ppl
:member-order: bysource

Config
-------------------
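The updated rst documents a `Pipeline` class with the members `__init__`, `infer`, `stream_infer`, `chat` and `get_ppl`. A rough usage sketch only — the exact signatures are not part of this diff, so the prompt and `gen_config` arguments below are assumptions borrowed from the existing pipeline docs:

```python
from lmdeploy import Pipeline, GenerationConfig

# Model id reused from docs/en/llm/pipeline.md; any supported model path should work.
pipe = Pipeline('internlm/internlm2_5-7b-chat')
gen_config = GenerationConfig(max_new_tokens=10)

# Batch/offline inference via the documented `infer` member.
print(pipe.infer(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config))

# Incremental output via the documented `stream_infer` member.
for chunk in pipe.stream_infer('Shanghai is', gen_config=gen_config):
    print(chunk)
```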
2 changes: 1 addition & 1 deletion docs/en/llm/pipeline.md
@@ -123,7 +123,7 @@ from lmdeploy import pipeline, GenerationConfig

pipe = pipeline('internlm/internlm2_5-7b-chat')

gen_config=GenerationConfig(output_logits='generation'
gen_config=GenerationConfig(output_logits='generation',
max_new_tokens=10)
response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
gen_config=gen_config)
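The only change in this file is the missing comma after `output_logits='generation'`. For reference, a self-contained version of the corrected snippet (whitespace normalized and the final `print` added for illustration):

```python
from lmdeploy import pipeline, GenerationConfig

pipe = pipeline('internlm/internlm2_5-7b-chat')

# The comma at the end of this line is the fix applied by this hunk.
gen_config = GenerationConfig(output_logits='generation',
                              max_new_tokens=10)
response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
                gen_config=gen_config)
print(response)
```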
11 changes: 5 additions & 6 deletions docs/zh_cn/api/pipeline.rst
@@ -5,12 +5,11 @@
Pipeline
--------
.. autofunction:: pipeline

Serving
--------
.. autofunction:: lmdeploy.api.serve
.. autofunction:: lmdeploy.api.client

.. autoclass:: Pipeline
:undoc-members:
:show-inheritance:
:members: __init__, infer, stream_infer, chat, get_ppl
:member-order: bysource

Config
-------------------
3 changes: 2 additions & 1 deletion lmdeploy/__init__.py
@@ -3,10 +3,11 @@
from .api import client, pipeline, serve
from .messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
from .model import ChatTemplateConfig
from .pipeline import Pipeline
from .tokenizer import Tokenizer
from .version import __version__, version_info

__all__ = [
'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', '__version__', 'version_info', 'ChatTemplateConfig',
'PytorchEngineConfig', 'TurbomindEngineConfig', 'VisionConfig'
'PytorchEngineConfig', 'TurbomindEngineConfig', 'VisionConfig', 'Pipeline'
]
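With `Pipeline` added to `__all__`, both the factory function and the class are importable from the package root. A minimal sketch; the model id is just the one used elsewhere in these docs:

```python
from lmdeploy import Pipeline, pipeline

# pipeline() now builds and returns a Pipeline instance (see lmdeploy/api.py below),
# so the class can also be used directly from the package root.
pipe = pipeline('internlm/internlm2_5-7b-chat')
assert isinstance(pipe, Pipeline)
```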
172 changes: 41 additions & 131 deletions lmdeploy/api.py
@@ -1,18 +1,23 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import List, Literal, Optional, Union
from __future__ import annotations

from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, SpeculativeConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig
from typing import TYPE_CHECKING, List, Literal

from typing_extensions import deprecated

from .pipeline import Pipeline

if TYPE_CHECKING:
from .messages import PytorchEngineConfig, SpeculativeConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig


def pipeline(model_path: str,
backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None,
chat_template_config: 'ChatTemplateConfig' | None = None,
log_level: str = 'WARNING',
max_log_len: int = None,
speculative_config: SpeculativeConfig = None,
max_log_len: int | None = None,
speculative_config: 'SpeculativeConfig' | None = None,
**kwargs):
"""
Args:
@@ -59,141 +64,46 @@ def pipeline(model_path: str,
print(response)

""" # noqa E501
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger, get_model
logger = get_logger('lmdeploy')
logger.setLevel(log_level)

# model_path is not local path.
if not os.path.exists(model_path):
download_dir = backend_config.download_dir \
if backend_config is not None else None
revision = backend_config.revision \
if backend_config is not None else None
model_path = get_model(model_path, download_dir, revision)

# spec model
if speculative_config is not None and speculative_config.model and not os.path.exists(speculative_config.model):
download_dir = backend_config.download_dir \
if backend_config is not None else None
speculative_config.model = get_model(speculative_config.model, download_dir)

_, pipeline_class = get_task(model_path)
if not isinstance(backend_config, PytorchEngineConfig):
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if isinstance(backend_config, PytorchEngineConfig) else 'turbomind'
logger.info(f'Using {backend} engine')

return pipeline_class(model_path,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
max_log_len=max_log_len,
speculative_config=speculative_config,
**kwargs)

return Pipeline(model_path,
backend_config=backend_config,
chat_template_config=chat_template_config,
log_level=log_level,
max_log_len=max_log_len,
speculative_config=speculative_config,
**kwargs)


@deprecated('This function is no longer available. Please use CLI command "lmdeploy serve api_server" instead.')
def serve(model_path: str,
model_name: Optional[str] = None,
model_name: str | None = None,
backend: Literal['turbomind', 'pytorch'] = 'turbomind',
backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None,
chat_template_config: 'ChatTemplateConfig' | None = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
log_level: str = 'ERROR',
api_keys: Optional[Union[List[str], str]] = None,
api_keys: List[str] | str | None = None,
ssl: bool = False,
**kwargs):
"""This will run the api_server in a subprocess.
"""This function is deprecated and no longer available.

Args:
model_path: the path of a model.
It could be one of the following options:

- i) A local directory path of a turbomind model which is
converted by ``lmdeploy convert`` command or download from
ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
``InternLM/internlm-chat-20b-4bit``,
``lmdeploy/llama2-chat-70b-4bit``, etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as ``internlm/internlm-chat-7b``,
``Qwen/Qwen-7B-Chat``, ``baichuan-inc/Baichuan2-7B-Chat``
and so on.
.. deprecated::
This function has been removed. Please use alternative methods.

model_name: the name of the served model. It can be accessed
by the RESTful API ``/v1/models``. If it is not specified,
``model_path`` will be adopted
backend: either ``turbomind`` or ``pytorch`` backend. Default to
``turbomind`` backend.
backend_config: backend
config instance. Default to none.
chat_template_config: chat template configuration.
Default to None.
server_name: host ip for serving
server_port: server port
log_level: set log level whose value among
[``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``]
api_keys: Optional list of API keys. Accepts string type as
a single api_key. Default to None, which means no api key applied.
ssl: Enable SSL. Requires OS Environment variables
``SSL_KEYFILE`` and ``SSL_CERTFILE``.
This will run the api_server in a subprocess.
""" # noqa E501
raise NotImplementedError("The 'serve' function is no longer available. "
'This function has been deprecated and removed.')

Return:
APIClient: A client chatbot for LLaMA series models.

Examples:
@deprecated('This function is no longer available. Please use "from lmdeploy.serve import APIClient" instead.')
def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: str | None = None, **kwargs):
"""This function is deprecated and no longer available.

.. code-block:: python
.. deprecated::
This function has been removed. Please use ``from lmdeploy.serve import APIClient`` instead.

from lmdeploy.api import serve
client = serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
for output in client.chat('hi', 1):
print(output)
""" # noqa E501
import time
from multiprocessing import Process

from lmdeploy.serve.openai.api_client import APIClient
from lmdeploy.serve.openai.api_server import serve

if type(backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind'

task = Process(target=serve,
args=(model_path, ),
kwargs=dict(model_name=model_name,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
server_name=server_name,
server_port=server_port,
log_level=log_level,
api_keys=api_keys,
ssl=ssl,
**kwargs),
daemon=True)
task.start()
client = APIClient(f'http://{server_name}:{server_port}')
while True:
time.sleep(1)
try:
client.available_models
print(f'Launched the api_server in process {task.pid}, user can '
f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
'signal.SIGKILL)')
return client
except: # noqa
pass


def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: Optional[str] = None, **kwargs):
"""
Args:
api_server_url: communicating address ``http://<ip>:<port>`` of
api_server
@@ -202,5 +112,5 @@ def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: Optional[str]
Return:
Chatbot for LLaMA series models with turbomind as inference engine.
"""
from lmdeploy.serve.openai.api_client import APIClient
return APIClient(api_server_url, api_key, **kwargs)
raise NotImplementedError("The 'client' function is no longer available. This function has been deprecated. "
' Please use "from lmdeploy.serve import APIClient" instead.')
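In the rewritten `api.py`, `pipeline()` only resolves its arguments and forwards them to `Pipeline`, while `serve()` and `client()` remain as `@deprecated` stubs that raise `NotImplementedError`. A sketch of the replacements named in their deprecation messages, assuming `APIClient` keeps the `(url, api_key)` constructor used by the removed helper:

```python
# Offline inference: pipeline() forwards straight to Pipeline.
from lmdeploy import pipeline
pipe = pipeline('internlm/internlm2_5-7b-chat', log_level='WARNING')

# Serving: start the server from the CLI instead of api.serve(), e.g.
#   lmdeploy serve api_server internlm/internlm2_5-7b-chat
# Client: import APIClient from lmdeploy.serve instead of calling api.client().
from lmdeploy.serve import APIClient

client = APIClient('http://0.0.0.0:23333')   # default address of the old client() helper
print(client.available_models)               # same attribute the removed serve() helper polled
```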
75 changes: 31 additions & 44 deletions lmdeploy/archs.py
@@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Dict, List, Literal, Optional, Union
from typing import Dict, List, Literal, Tuple

from transformers import AutoConfig

@@ -57,8 +57,8 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:

def autoget_backend_config(
model_path: str,
backend_config: Optional[Union[PytorchEngineConfig, TurbomindEngineConfig]] = None
) -> Union[PytorchEngineConfig, TurbomindEngineConfig]:
backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None
) -> Tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]:
"""Get backend config automatically.

Args:
@@ -72,14 +72,14 @@ def autoget_backend_config(
"""
from dataclasses import asdict

if isinstance(backend_config, PytorchEngineConfig):
return 'pytorch', backend_config

backend = autoget_backend(model_path)
if backend == 'pytorch':
config = PytorchEngineConfig()
else:
config = TurbomindEngineConfig()
config = PytorchEngineConfig() if backend == 'pytorch' else TurbomindEngineConfig()
if backend_config is not None:
if type(backend_config) == type(config):
return backend_config
config = backend_config
else:
data = asdict(backend_config)
for k, v in data.items():
@@ -90,7 +90,7 @@ def check_vl_llm(config: dict) -> bool:
config.block_size = backend_config.cache_block_seq_len
else:
config.cache_block_seq_len = backend_config.block_size
return config
return backend, config


def check_vl_llm(config: dict) -> bool:
@@ -126,14 +126,14 @@ def check_vl_llm(config: dict) -> bool:

def get_task(model_path: str):
"""Get pipeline type and pipeline class from model config."""
from lmdeploy.serve.async_engine import AsyncEngine
from lmdeploy.serve.core import AsyncEngine

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm', AsyncEngine
_, config = get_model_arch(model_path)
if check_vl_llm(config.to_dict()):
from lmdeploy.serve.vl_async_engine import VLAsyncEngine
from lmdeploy.serve.core import VLAsyncEngine
return 'vlm', VLAsyncEngine

# default task, pipeline_class
@@ -146,40 +146,27 @@ def get_model_arch(model_path: str):
Args:
model_path(str): the model path
"""
if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# the turbomind model
import yaml
config_file = os.path.join(model_path, 'triton_models', 'weights', 'config.yaml')
with open(config_file, 'r') as f:
config = yaml.safe_load(f)

from .turbomind.deploy.config import TurbomindModelConfig
tm_config = TurbomindModelConfig.from_dict(config)

return tm_config.model_config.model_arch, tm_config
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
arch = _cfg['architectures'][0]
if _cfg.get('auto_map'):
for _, v in _cfg['auto_map'].items():
if 'InternLMXComposer2ForCausalLM' in v:
arch = 'InternLMXComposer2ForCausalLM'
elif _cfg.get('auto_map', None) and 'AutoModelForCausalLM' in _cfg['auto_map']:
arch = _cfg['auto_map']['AutoModelForCausalLM'].split('.')[-1]
elif _cfg.get('language_config', None) and _cfg['language_config'].get(
'auto_map', None) and 'AutoModelForCausalLM' in _cfg['language_config']['auto_map']:
arch = _cfg['language_config']['auto_map']['AutoModelForCausalLM'].split('.')[-1]
else:
# transformers model
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
arch = _cfg['architectures'][0]
if _cfg.get('auto_map'):
for _, v in _cfg['auto_map'].items():
if 'InternLMXComposer2ForCausalLM' in v:
arch = 'InternLMXComposer2ForCausalLM'
elif _cfg.get('auto_map', None) and 'AutoModelForCausalLM' in _cfg['auto_map']:
arch = _cfg['auto_map']['AutoModelForCausalLM'].split('.')[-1]
elif _cfg.get('language_config', None) and _cfg['language_config'].get(
'auto_map', None) and 'AutoModelForCausalLM' in _cfg['language_config']['auto_map']:
arch = _cfg['language_config']['auto_map']['AutoModelForCausalLM'].split('.')[-1]
else:
raise RuntimeError(f'Could not find model architecture from config: {_cfg}')
return arch, cfg
raise RuntimeError(f'Could not find model architecture from config: {_cfg}')
return arch, cfg


def search_nested_config(config, key):
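The signature change in `archs.py` means `autoget_backend_config` now returns a `(backend, config)` pair and short-circuits when a `PytorchEngineConfig` is passed in; `get_task` also resolves the engine classes from `lmdeploy.serve.core`. A small sketch of a caller adapting to the new return value (the model path is illustrative only):

```python
from lmdeploy.archs import autoget_backend_config, get_task
from lmdeploy.messages import PytorchEngineConfig

# Passing a PytorchEngineConfig now returns immediately with backend='pytorch'.
backend, config = autoget_backend_config('internlm/internlm2_5-7b-chat',
                                         PytorchEngineConfig())
print(backend)                       # 'pytorch' or 'turbomind'

# get_task still returns (task_type, pipeline_class), now imported from lmdeploy.serve.core.
task, engine_cls = get_task('internlm/internlm2_5-7b-chat')
print(task, engine_cls.__name__)     # e.g. 'llm' AsyncEngine or 'vlm' VLAsyncEngine
```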