@@ -68,55 +68,68 @@ def use_pplx_kernels(self):
def make(tp_size_: int, dp_size_: int,
vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
"""
- Determine MoE parallel configuration. Based on the input tp_size_, dp_size_,
- ep_size_ and vllm's parallel config, determine what level's of parallelism
- to use in the fused moe layer.
+ Determine MoE parallel configuration. Based on the input tp_size_,
+ dp_size_, ep_size_ and vllm's parallel config, determine what
+ levels of parallelism to use in the fused MoE layer.

Args:
tp_size_ (int): tp_size passed into the FusedMoE constructor.
dp_size_ (int): dp_size passed into the FusedMoE constructor.
ep_size_ (int): ep_size passed into the FusedMoE constructor.
- vllm_parallel_config (ParallelConfig): vllm's parallel config object.
+ vllm_parallel_config (ParallelConfig): vllm's parallel config
+ object.

Examples:
When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1,
we simply return the sizes unaltered and the ranks set to 0.

- Expert Parallelism is considered only when either dp_size_ or tp_size_ is non trivial.
+ Expert Parallelism is considered only when either dp_size_ or tp_size_
+ is non-trivial.

- When TP = 2, DP = 1 and EP = False, the configuration on different devices,
- - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // legend : {size, rank}
+ When TP = 2, DP = 1 and EP = False, the configuration on different
+ devices,
+ - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
+ legend : {size, rank}
- device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
- Comment : Tensors are sharded across 2 devices.

- When TP = 1, DP = 2 and EP = False, the configuration on different devices,
+ When TP = 1, DP = 2 and EP = False, the configuration on different
+ devices,
- device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
- device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
- - Comment: There are 2 engine instances and the tensors are sharded across 2 decvices.
+ - Comment: There are 2 engine instances and the tensors are sharded
+ across 2 devices.

- When TP = 2, DP = 2 and EP = False, the configuration on different devices,
+ When TP = 2, DP = 2 and EP = False, the configuration on different
+ devices,
- device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
- device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
- device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
- device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
- - Comment: There are 2 engine instances and the tensors are sharded across 4 devices.
+ - Comment: There are 2 engine instances and the tensors are sharded
+ across 4 devices.

- When, TP = 2, DP = 1 and EP = True, the configuration on different devices,
+ When TP = 2, DP = 1 and EP = True, the configuration on different
+ devices,
- device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
- device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
- Comment: The experts are split between the 2 devices.

- When, TP = 1, DP = 2 and EP = True, the configuration on different devices,
+ When TP = 1, DP = 2 and EP = True, the configuration on different
+ devices,
- device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
- device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
- - Comment: There are 2 engine instances and the experts are split between the 2 devices.
+ - Comment: There are 2 engine instances and the experts are split
+ between the 2 devices.

- When TP = 2, DP = 2 and EP = True, the configuration on different devices,
+ When TP = 2, DP = 2 and EP = True, the configuration on different
+ devices,
- device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
- device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
- device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
- device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
- - Comment: There are 2 engine instances and the experts are split between the 4 devices.
+ - Comment: There are 2 engine instances and the experts are split
+ between the 4 devices.
"""

def flatten_tp_across_dp(dp_rank: int):
@@ -127,7 +140,8 @@ def flatten_tp_across_dp(dp_rank: int):
tp_rank = dp_rank * tp_size_ + tp_rank
return tp_size, tp_rank

- use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel
+ use_ep = (dp_size_ * tp_size_ > 1
+ and vllm_parallel_config.enable_expert_parallel)

dp_size = dp_size_
dp_rank = get_dp_group().rank_in_group
@@ -143,8 +157,8 @@ def flatten_tp_across_dp(dp_rank: int):
use_ep=False)
# DP + EP / TP + EP / DP + TP + EP
assert use_ep
- # In EP, each device owns a set of experts fully. There is no tensor parallel.
- # Update tp_size, tp_rank, ep_size and ep_rank to reflect that.
+ # In EP, each device owns a set of experts fully. There is no tensor
+ # parallelism; update tp_size, tp_rank, ep_size and ep_rank to reflect that.
ep_size = tp_size
ep_rank = tp_rank
return FusedMoEParallelConfig(tp_size=1,
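
To make the {size, rank} tables in the docstring concrete, here is a small standalone sketch (an illustration, not the vLLM implementation; `sketch_moe_parallel_config` is a hypothetical name) of the rule the hunks above implement: TP ranks are flattened across DP replicas, and when EP is enabled the flattened TP grid becomes the EP grid while TP collapses to {1, 0}.

```python
def sketch_moe_parallel_config(tp_size_: int, dp_size_: int, use_ep: bool):
    """Yield per-device (tp, dp, ep) entries, each a (size, rank) pair."""
    for dp_rank in range(dp_size_):
        for local_tp_rank in range(tp_size_):
            # Devices are laid out DP-major, mirroring flatten_tp_across_dp:
            # device index = dp_rank * tp_size_ + tp_rank.
            device = dp_rank * tp_size_ + local_tp_rank
            # TP ranks are flattened across DP replicas, so the effective
            # TP world size is tp_size_ * dp_size_.
            tp = (tp_size_ * dp_size_, device)
            dp = (dp_size_, dp_rank)
            if use_ep:
                # With EP, each device owns its experts outright: the
                # flattened TP grid becomes the EP grid and TP collapses
                # to {1, 0}.
                ep, tp = tp, (1, 0)
            else:
                ep = (1, 0)
            yield device, tp, dp, ep


# Reproduces the "TP = 2, DP = 2 and EP = True" table from the docstring:
#   device 0: TP = (1, 0) DP = (2, 0) EP = (4, 0) ... device 3: EP = (4, 3)
for dev, tp, dp, ep in sketch_moe_parallel_config(2, 2, use_ep=True):
    print(f"device {dev}: TP = {tp} DP = {dp} EP = {ep}")
```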
@@ -719,12 +733,13 @@ def __init__(
self.params_dtype = params_dtype

vllm_config = get_current_vllm_config()
- self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
- tp_size_=(tp_size if tp_size is not None else
- get_tensor_model_parallel_world_size()),
- dp_size_=(dp_size
- if dp_size is not None else get_dp_group().world_size),
- vllm_parallel_config=vllm_config.parallel_config)
+ self.moe_parallel_config: FusedMoEParallelConfig = (
+ FusedMoEParallelConfig.make(
+ tp_size_=(tp_size if tp_size is not None else
+ get_tensor_model_parallel_world_size()),
+ dp_size_=(dp_size if dp_size is not None else
+ get_dp_group().world_size),
+ vllm_parallel_config=vllm_config.parallel_config))

self.global_num_experts = num_experts
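
The constructor hunk above only re-wraps the `FusedMoEParallelConfig.make(...)` call, but the pattern it wraps is worth spelling out: an explicitly passed `tp_size`/`dp_size` wins, otherwise the size is read from the active process group. A minimal sketch of that fallback, with `resolve_parallel_size` as a hypothetical helper name:

```python
from typing import Callable, Optional


def resolve_parallel_size(explicit: Optional[int],
                          group_size: Callable[[], int]) -> int:
    """Prefer an explicitly passed size; otherwise query the process group."""
    return explicit if explicit is not None else group_size()


# Mirrors the call above (names taken from the diff):
#   tp_size_ = resolve_parallel_size(
#       tp_size, get_tensor_model_parallel_world_size)
#   dp_size_ = resolve_parallel_size(
#       dp_size, lambda: get_dp_group().world_size)
```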
@@ -1184,8 +1199,9 @@ def must_reduce_shared_outputs(self) -> bool:
def maybe_all_reduce_tensor_model_parallel(
self, final_hidden_states: torch.Tensor):
"""
- The pplx combine kernel reduce across GPU ranks by default. The pplx kernels are
- used when EP is enabled. In that case, this function is a no-op.
+ The pplx combine kernel reduces across GPU ranks by default. The pplx
+ kernels are used when EP is enabled. In that case, this function is a
+ no-op.
"""
if self.dp_size > 1 and self.use_ep and has_pplx:
return final_hidden_states
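
A hedged sketch of the control flow this docstring describes: when the pplx combine kernel has already reduced expert outputs across ranks (EP enabled, DP > 1, pplx available), the extra tensor-parallel all-reduce is skipped; otherwise the reduction runs. `maybe_all_reduce` and the injected `all_reduce_fn` are illustrative names, not the vLLM API:

```python
import torch


def maybe_all_reduce(final_hidden_states: torch.Tensor,
                     dp_size: int, use_ep: bool, has_pplx: bool,
                     all_reduce_fn) -> torch.Tensor:
    if dp_size > 1 and use_ep and has_pplx:
        # The pplx combine kernel has already summed partial expert outputs
        # across ranks, so the tensor is returned unchanged.
        return final_hidden_states
    # Otherwise reduce partial results across the tensor-parallel group
    # (in vLLM this would be tensor_model_parallel_all_reduce).
    return all_reduce_fn(final_hidden_states)
```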