
Commit 4e21b15

[BugFix] Check all expert maps when using multiple instances. (#3662)
### What this PR does / why we need it?
Check all expert maps for consistency when using multiple instances.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Qwen 235B on a double A3 setup:
- Case 1: master has an expert map, slave has no expert map.
- Case 2: master has an expert map, slave has an incorrect expert map.
- Case 3: master has an expert map, slave has a correct expert map.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: offline0806 <[email protected]>
Co-authored-by: offline0806 <[email protected]>
1 parent b321e38 · commit 4e21b15

3 files changed: +25 −5 lines changed

vllm_ascend/ops/common_fused_moe.py

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ def __init__(self, *args, **kwargs):
                       os.R_OK):
             self.expert_load_balancer = ExpertLoadBalancer(
                 self.expert_map_path, self.global_num_experts)
+            self.expert_load_balancer.check_expert_map_tensor()
             self.global_redundant_expert_num = (
                 self.expert_load_balancer.get_global_redundant_expert_num())
         try:
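For context, the `expert_map_path` handed to `ExpertLoadBalancer` here points at a JSON file whose shape can be inferred from `_expert_file_to_tensor` in the next file. Below is a minimal sketch of writing such a file; the layer count, device count, and expert IDs are made up for illustration, and real maps may carry additional fields.

```python
# Hypothetical expert-map file using only the keys _expert_file_to_tensor reads:
# "moe_layer_count", "layer_list", "device_count", "device_list", "device_expert".
import json

expert_map = {
    "moe_layer_count": 1,          # one MoE layer in this toy example
    "layer_list": [
        {
            "device_count": 2,     # number of ranks holding experts for this layer
            "device_list": [
                {"device_expert": [0, 1, 4]},  # physical experts placed on rank 0
                {"device_expert": [2, 3, 5]},  # physical experts placed on rank 1
            ],
        }
    ],
}

with open("expert_map.json", "w") as f:
    json.dump(expert_map, f, indent=2)
```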

vllm_ascend/ops/expert_load_balancer.py

Lines changed: 23 additions & 5 deletions
@@ -3,29 +3,31 @@
 from typing import Dict, List
 
 import torch
+import torch.distributed as dist
 
 
 class ExpertLoadBalancer(object):
 
     def __init__(self, expert_map_path, global_expert_num):
         self.expert_map_path = expert_map_path
         self.global_expert_num = global_expert_num
+        self.tensor_data = []
         self.expert_map_tensor, self.layers_num, self.ranks_num = (
             self._expert_file_to_tensor())
+        self.expert_placement_map = self.generate_expert_placement_map()
 
     def _expert_file_to_tensor(self):
         with open(self.expert_map_path, "r") as f:
             data = json.load(f)
         layers_num = data["moe_layer_count"]
         gpus_num = data["layer_list"][0]["device_count"]
 
-        tensor_data = []
         for layer in data["layer_list"]:
             device_data = []
             for device in layer["device_list"]:
                 device_data.append(device["device_expert"])
-            tensor_data.append(device_data)
-        expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32)
+            self.tensor_data.append(device_data)
+        expert_map_tensor = torch.tensor(self.tensor_data, dtype=torch.int32)
         return expert_map_tensor, layers_num, gpus_num
 
     def generate_index_dicts(self, tensor_2d):

@@ -81,8 +83,7 @@ def generate_log2phy_expert_map(self, layer_id):
         return log2phy_map
 
     def get_rank_placement_map(self, layer_id, rank_id):
-        expert_placement_map = self.generate_expert_placement_map()
-        layer_expert_map = expert_placement_map[layer_id]
+        layer_expert_map = self.expert_placement_map[layer_id]
         rank_expert_map = layer_expert_map[rank_id].to(
             torch.npu.current_device())
         rank_local_expert_num = torch.sum(torch.ne(rank_expert_map, -1)).item()

@@ -97,3 +98,20 @@ def get_global_redundant_expert_num(self):
             len(self.expert_map_tensor[0][0]) * self.ranks_num -
             self.global_expert_num)
         return global_redundant_expert_num
+
+    def check_expert_map_tensor(self):
+        if dist.is_initialized():
+            try:
+                rank = dist.get_rank()
+                world_size = dist.get_world_size()
+                all_expert_maps = [None for _ in range(world_size)]
+                dist.all_gather_object(all_expert_maps, self.tensor_data)
+                for rank_id, expert_map_tensor in enumerate(all_expert_maps):
+                    if self.tensor_data != expert_map_tensor:
+                        raise ValueError(
+                            f"The expert map of rank{rank} is not equal to rank{rank_id}"
+                        )
+                return True
+            except Exception as e:
+                raise ValueError(
+                    f"The expert maps of all ranks are inconsistency: {e}")

vllm_ascend/torchair/ops/torchair_fused_moe.py

Lines changed: 1 addition & 0 deletions
@@ -1042,6 +1042,7 @@ def __init__(
                       os.R_OK):
             self.expert_load_balancer = ExpertLoadBalancer(
                 self.expert_map_path, self.global_num_experts)
+            self.expert_load_balancer.check_expert_map_tensor()
             self.global_redundant_expert_num = (
                 self.expert_load_balancer.get_global_redundant_expert_num())
         try:
