Skip to content

Commit 704cb87

Browse files
authored
fix httpserver multinode_tp mode code. (#794)
1 parent df9f8d4 commit 704cb87

File tree

1 file changed

+16
-8
lines changed

1 file changed

+16
-8
lines changed

lightllm/server/httpserver/manager.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ def __init__(
5959
self.transfer_lock = asyncio.Lock() # the lock for transfer to next module in multi node mode.
6060
self.disable_abort = args.nnodes > 1 and args.dp == 1 # multinode dp=1 mode, disable abort
6161
self.is_multinode_tp = args.dp == 1 and args.nnodes > 1
62+
self.is_multinode_tp_master = args.dp == 1 and args.nnodes > 1 and args.node_rank == 0
63+
self.is_multinode_tp_slave = args.dp == 1 and args.nnodes > 1 and args.node_rank > 0
6264
if self.is_multinode_tp:
6365
if args.node_rank == 0:
6466
self.multinode_req_manager = []
@@ -192,7 +194,7 @@ def alloc_req_id(self, sampling_params, is_health_req: bool = False):
192194
if is_health_req:
193195
return sampling_params.group_request_id
194196
if self.pd_mode == NodeRole.NORMAL:
195-
if not (self.nnodes > 1 and self.args.dp == 1):
197+
if not self.is_multinode_tp:
196198
group_request_id = self.id_gen.generate_id()
197199
else:
198200
if self.node_rank == 0:
@@ -222,7 +224,7 @@ async def generate(
222224

223225
try:
224226
original_multimodal_params = None
225-
if self.nnodes > 1 and self.node_rank == 0 and self.args.dp == 1:
227+
if self.is_multinode_tp_master:
226228
original_multimodal_params = copy.deepcopy(multimodal_params)
227229

228230
if self.pd_mode.is_P_or_NORMAL():
@@ -366,8 +368,10 @@ async def transfer_to_next_module_or_node(
366368
original_multimodal_params: MultimodalParams,
367369
group_req_objs: Optional[GroupReqObjs] = None,
368370
):
369-
# 多节点纯tp 运行模式下,保证请求能保持相同的顺序转发到其他节点和当前节点next module.
370-
if self.nnodes > 1 and self.node_rank == 0 and self.args.dp == 1:
371+
# 多节点纯tp 运行模式下,master 节点需要将请求按照可控的顺序转发给slave节点,
372+
# 同时转发给slave节点的时候,要保证master节点按照转发的顺序转发给next_module
373+
# 所以需要锁的控制。
374+
if self.is_multinode_tp_master:
371375
async with self.transfer_lock:
372376
for sender in self.multinode_req_manager:
373377
sender.send_pyobj(
@@ -376,8 +380,10 @@ async def transfer_to_next_module_or_node(
376380
)
377381
await self.transfer_to_next_module(group_req_objs)
378382
return
379-
380-
if self.nnodes > 1 and self.node_rank > 0 and self.args.dp == 1:
383+
# 多节点纯tp 的slave节点,需要按照接受到请求的顺序转发,这需要锁和排队机制来保证。
384+
# self.request_order_queue 实现了一种简单的排队取出机制,这样master 和 slave
385+
# 节点的请求到达各自节点的router的顺序才是一致的,才能完成同步同态调度。
386+
if self.is_multinode_tp_slave:
381387
while True:
382388
if self.request_order_queue and self.request_order_queue[0] != group_req_objs.group_req_id:
383389
await asyncio.sleep(0.002)
@@ -578,8 +584,10 @@ async def handle_loop(self):
578584
if self.pd_mode.is_P_or_D():
579585
self.forwarding_queue = AsyncQueue()
580586
asyncio.create_task(self.pd_handle_loop())
581-
582-
if self.args.node_rank > 0:
587+
588+
# 多节点tp模式下的slave节点,需要开启一个协程task用来接收
589+
# master 转发过来的请求对象。
590+
if self.is_multinode_tp_slave:
583591
asyncio.create_task(self.loop_for_request())
584592

585593
while True:

0 commit comments

Comments
 (0)