api_cli.py
import argparse


def make_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"--run_mode",
type=str,
choices=["normal", "prefill", "decode", "pd_master"],
default="normal",
help="set run mode, normal is started for a single server, prefill decode pd_master is for pd split run mode",
)
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--httpserver_workers", type=int, default=1)
parser.add_argument(
"--zmq_mode",
type=str,
default="ipc:///tmp/",
help="use socket mode or ipc mode, only can be set in ['tcp://', 'ipc:///tmp/']",
)
parser.add_argument(
"--pd_master_ip",
type=str,
default="0.0.0.0",
help="when run_mode set to prefill or decode, you need set this pd_mater_ip",
)
parser.add_argument(
"--pd_master_port",
type=int,
default=1212,
help="when run_mode set to prefill or decode, you need set this pd_mater_port",
)
parser.add_argument(
"--pd_decode_rpyc_port",
type=int,
default=42000,
help="p d mode, decode node used for kv move manager rpyc server port",
)
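# Illustrative launch sketch for the prefill/decode split mode (assumes the usual
# `python -m lightllm.server.api_server` entrypoint; adjust IPs, ports and paths for your setup):
#   pd_master node:  python -m lightllm.server.api_server --run_mode pd_master --pd_master_port 1212
#   prefill node:    python -m lightllm.server.api_server --run_mode prefill --pd_master_ip <master_ip> --pd_master_port 1212
#   decode node:     python -m lightllm.server.api_server --run_mode decode --pd_master_ip <master_ip> --pd_master_port 1212 --pd_decode_rpyc_port 42000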
parser.add_argument(
"--model_name",
type=str,
default="default_model_name",
help="just help to distinguish internal model name, use 'host:port/get_model_name' to get",
)
parser.add_argument(
"--model_dir",
type=str,
default=None,
help="the model weight dir path, the app will load config, weights and tokenizer from this dir",
)
parser.add_argument(
"--tokenizer_mode",
type=str,
default="fast",
help="""tokenizer load mode, can be slow, fast or auto, slow mode load fast but run slow,
slow mode is good for debug and test, fast mode get best performance, auto mode will
try to use fast mode, if failed will use slow mode""",
)
parser.add_argument(
"--load_way",
type=str,
default="HF",
help="""the way of loading model weights, the default is HF(Huggingface format), llama also supports
DS(Deepspeed)""",
)
parser.add_argument(
"--max_total_token_num",
type=int,
default=None,
help="the total token nums the gpu and model can support, equals = max_batch * (input_len + output_len)",
)
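# Illustrative sizing example (hypothetical numbers, not from the original file):
# with max_batch = 32, input_len = 1024 and output_len = 512,
# max_total_token_num = 32 * (1024 + 512) = 49152.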
parser.add_argument(
"--mem_fraction",
type=float,
default=0.9,
help="""Memory usage ratio, default is 0.9, you can specify a smaller value if OOM occurs at runtime.
If max_total_token_num is not specified, it will be calculated automatically based on this value.""",
)
parser.add_argument(
"--batch_max_tokens",
type=int,
default=None,
help="max tokens num for new cat batch, it control prefill batch size to Preventing OOM",
)
parser.add_argument(
"--eos_id", nargs="+", type=int, default=None, help="eos stop token id, if None, will load from config.json"
)
parser.add_argument(
"--running_max_req_size", type=int, default=1000, help="the max size for forward requests in the same time"
)
parser.add_argument("--nnodes", type=int, default=1, help="the number of nodes")
parser.add_argument("--node_rank", type=int, default=0, help="the rank of the current node")
parser.add_argument(
"--multinode_httpmanager_port",
type=int,
default=12345,
help="the port for multinode http manager, default is 20000",
)
parser.add_argument(
"--multinode_router_gloo_port",
type=int,
default=20001,
help="the gloo port for multinode router, default is 20001",
)
parser.add_argument("--tp", type=int, default=1, help="model tp parral size, the default is 1")
parser.add_argument(
"--dp",
type=int,
default=1,
help="""This is just a useful parameter for deepseekv2. When
using the deepseekv2 model, set dp to be equal to the tp parameter. In other cases, please
do not set it and keep the default value as 1.""",
)
parser.add_argument(
"--max_req_total_len", type=int, default=16384, help="the max value for req_input_len + req_output_len"
)
parser.add_argument(
"--nccl_host",
type=str,
default="127.0.0.1",
help="""The nccl_host to build a distributed environment for PyTorch.
When deploying in a multi-node manner, the value should be set to the IP of the master node""",
)
parser.add_argument(
"--nccl_port", type=int, default=28765, help="the nccl_port to build a distributed environment for PyTorch"
)
parser.add_argument(
"--mode",
type=str,
default=[],
nargs="+",
help="""Model mode: [triton_int8kv | ppl_int8kv | ppl_fp16 | triton_flashdecoding
| triton_gqa_attention | triton_gqa_flashdecoding | triton_fp8kv,
triton_flashdecoding mode is for long context, current support llama llama2 qwen;
triton_gqa_attention and triton_gqa_flashdecoding is fast kernel for model which use GQA;
triton_int8kv mode use int8 to store kv cache, can increase token capacity, use triton kernel;
triton_fp8kv mode use float8 to store kv cache, currently only for deepseek2;
ppl_int8kv mode use int8 to store kv cache, and use ppl fast kernel;
ppl_fp16 mode use ppl fast fp16 decode attention kernel;
you need to read source code to make sure the supported detail mode for all models""",
)
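# Example (illustrative only): since --mode accepts multiple values via nargs="+",
# more than one mode can be passed on a single command line, e.g.
#   --mode triton_flashdecoding triton_int8kv
# whether a given combination is supported depends on the model; check the source code.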
parser.add_argument(
"--trust_remote_code",
action="store_true",
help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
)
parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.")
parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.")
parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch")
parser.add_argument(
"--router_max_new_token_len", type=int, default=1024, help="the request max new token len for router"
)
parser.add_argument(
"--router_max_wait_tokens",
type=int,
default=6,
help="schedule new requests after every router_max_wait_tokens decode steps.",
)
parser.add_argument(
"--disable_aggressive_schedule",
action="store_true",
help="""aggressive schedule can lead to frequent prefill interruptions during decode.
disabling it allows the router_max_wait_tokens parameter to work more effectively.""",
)
parser.add_argument("--use_dynamic_prompt_cache", action="store_true", help="use_dynamic_prompt_cache test")
parser.add_argument("--chunked_prefill_size", type=int, default=8192, help="chunked prefill size")
parser.add_argument("--disable_chunked_prefill", action="store_true", help="whether to disable chunked prefill")
parser.add_argument("--diverse_mode", action="store_true", help="diversity generation mode")
parser.add_argument("--token_healing_mode", action="store_true", help="code model infer mode")
parser.add_argument(
"--output_constraint_mode",
type=str,
choices=["outlines", "xgrammar", "none"],
default="none",
help="set the output constraint backend, none means no output constraint",
)
parser.add_argument(
"--first_token_constraint_mode",
action="store_true",
help="""constraint the first token allowed range,
use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""",
)
parser.add_argument(
"--enable_multimodal", action="store_true", help="Whether or not to allow to load additional multimodal models."
)
parser.add_argument(
"--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service."
)
parser.add_argument("--enable_custom_allreduce", action="store_true", help="Whether to disable cutom allreduce.")
parser.add_argument("--enable_custom_allgather", action="store_true", help="Whether to enable cutom allgather.")
parser.add_argument(
"--enable_tpsp_mix_mode",
action="store_true",
help="""inference backend will use TP SP Mixed running mode.
only llama and deepseek v3 model supported now.""",
)
parser.add_argument(
"--enable_prefill_microbatch_overlap",
action="store_true",
help="""inference backend will use microbatch overlap mode for prefill.
only deepseekv3 model supported now.""",
)
parser.add_argument(
"--enable_decode_microbatch_overlap",
action="store_true",
help="""inference backend will use microbatch overlap mode for decode.
only deepseekv3 model supported now.""",
)
parser.add_argument(
"--enable_flashinfer_prefill",
action="store_true",
help="""inference backend will use the attention kernel of flashinfer for prefill,
only deepseekv3 model supported now.""",
)
parser.add_argument(
"--enable_flashinfer_decode",
action="store_true",
help="""inference backend will use the attention kernel of flashinfer for decode,
only deepseekv3 model supported now.""",
)
parser.add_argument(
"--cache_capacity", type=int, default=200, help="cache server capacity for multimodal resources"
)
parser.add_argument(
"--cache_reserved_ratio", type=float, default=0.5, help="cache server reserved capacity ratio after clear"
)
parser.add_argument(
"--data_type",
type=str,
choices=["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"],
default=None,
help="the data type of the model weight",
)
parser.add_argument("--return_all_prompt_logprobs", action="store_true", help="return all prompt tokens logprobs")
parser.add_argument("--use_reward_model", action="store_true", help="use reward model")
parser.add_argument(
"--long_truncation_mode",
type=str,
choices=[None, "head", "center"],
default=None,
help="""use to select the handle way when input_token_len + max_new_tokens > max_req_total_len.
None : raise Exception
head : remove some head tokens to make input_token_len + max_new_tokens <= max_req_total_len
center : remove some tokens in center loc to make input_token_len + max_new_tokens <= max_req_total_len""",
)
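# Worked example (hypothetical numbers, not from the original file): with max_req_total_len = 16384,
# input_token_len = 16000 and max_new_tokens = 1000, the request exceeds the limit by 616 tokens;
# "head" mode drops 616 tokens from the head so that 15384 + 1000 <= 16384, while "center" mode
# drops 616 tokens from the middle of the input instead.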
parser.add_argument("--use_tgi_api", action="store_true", help="use tgi input and ouput format")
parser.add_argument(
"--health_monitor", action="store_true", help="check the health of service and restart when error"
)
parser.add_argument("--metric_gateway", type=str, default=None, help="address for collecting monitoring metrics")
parser.add_argument("--job_name", type=str, default="lightllm", help="job name for monitor")
parser.add_argument(
"--grouping_key", action="append", default=[], help="grouping_key for the monitor in the form key=value"
)
parser.add_argument("--push_interval", type=int, default=10, help="interval of pushing monitoring metrics")
parser.add_argument(
"--visual_infer_batch_size", type=int, default=1, help="number of images to process in each inference batch"
)
parser.add_argument(
"--visual_gpu_ids", nargs="+", type=int, default=[0], help="List of GPU IDs to use, e.g., 0 1 2"
)
parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT")
parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT")
parser.add_argument(
"--visual_nccl_ports",
nargs="+",
type=int,
default=[29500],
help="List of NCCL ports to build a distributed environment for Vit, e.g., 29500 29501 29502",
)
parser.add_argument(
"--enable_monitor_auth", action="store_true", help="Whether to open authentication for push_gateway"
)
parser.add_argument("--disable_cudagraph", action="store_true", help="Disable the cudagraph of the decoding stage")
parser.add_argument(
"--graph_max_batch_size",
type=int,
default=16,
help="""Maximum batch size that can be captured by the cuda graph for decodign stage.
The default value is 8. It will turn into eagar mode if encounters a larger value.""",
)
parser.add_argument(
"--graph_max_len_in_batch",
type=int,
default=0,
help="""Maximum sequence length that can be captured by the cuda graph for decodign stage.
The default value is 8192. It will turn into eagar mode if encounters a larger value. """,
)
parser.add_argument(
"--quant_type",
type=str,
default="none",
help="""Quantization method: ppl-w4a16-128 | flashllm-w6a16
| ao-int4wo-[32,64,128,256] | ao-int8wo | ao-fp8w8a16 | ao-fp6w6a16
| vllm-w8a8 | vllm-fp8w8a8 | vllm-fp8w8a8-b128
| triton-fp8w8a8-block128""",
)
parser.add_argument(
"--quant_cfg",
type=str,
default=None,
help="""Path of quantization config. It can be used for mixed quantization.
Examples can be found in lightllm/common/quantization/configs.""",
)
parser.add_argument(
"--vit_quant_type",
type=str,
default="none",
help="""Quantization method: ppl-w4a16-128 | flashllm-w6a16
| ao-int4wo-[32,64,128,256] | ao-int8wo | ao-fp8w8a16 | ao-fp6w6a16
| vllm-w8a8 | vllm-fp8w8a8""",
)
parser.add_argument(
"--vit_quant_cfg",
type=str,
default=None,
help="""Path of quantization config. It can be used for mixed quantization.
Examples can be found in lightllm/common/quantization/configs.""",
)
return parser
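

# Minimal usage sketch (illustrative, not part of the original module): build the parser
# defined above and parse command-line arguments, e.g.
#   python api_cli.py --model_dir /path/to/model --tp 1 --port 8000
if __name__ == "__main__":
    args = make_argument_parser().parse_args()
    print(args)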