131131from vllm_ascend .utils import (ACL_FORMAT_FRACTAL_ND , ACL_FORMAT_FRACTAL_NZ ,
132132 AscendSocVersion , ProfileExecuteDuration ,
133133 enable_sp , get_ascend_soc_version , is_310p ,
134- is_enable_nz , lmhead_tp_enable )
134+ is_enable_nz , lmhead_tp_enable , is_moe_model )
135135from vllm_ascend .worker .npu_input_batch import CachedRequestState , InputBatch
136136
137137if TYPE_CHECKING :
@@ -470,11 +470,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
470470 self .in_profile_run = False
471471
472472 self ._init_mc2_tokens_capacity ()
473- self .reserved_mc2_mask = torch .zeros (
474- self .mc2_tokens_capacity ,
475- dtype = torch .bool ,
476- device = self .device ,
477- )
473+ if is_moe_model (vllm_config ):
474+ self .reserved_mc2_mask = torch .zeros (
475+ self .mc2_tokens_capacity ,
476+ dtype = torch .bool ,
477+ device = self .device ,
478+ )
479+ else :
480+ self .reserved_mc2_mask = None
478481 self .dynamic_eplb = self .ascend_config .dynamic_eplb or self .ascend_config .expert_map_record_path
479482 if self .dynamic_eplb :
480483 EPLBParamUtils .check_dynamic_eplb (self .ascend_config .dynamic_eplb )
@@ -1339,9 +1342,7 @@ def _prepare_inputs(
13391342 self .query_lens = torch .from_numpy (num_scheduled_tokens )
13401343
13411344 # Copy the tensors to the NPU.
1342- self .input_ids [:total_num_scheduled_tokens ].copy_ (
1343- self .input_ids_cpu [:total_num_scheduled_tokens ], non_blocking = True )
1344-
1345+ self ._prepare_input_ids (total_num_scheduled_tokens , cu_num_tokens )
13451346 self .positions_cpu [total_num_scheduled_tokens :num_input_tokens ].zero_ ()
13461347 self .positions [:num_input_tokens ].copy_ (
13471348 self .positions_cpu [:num_input_tokens ], non_blocking = True )
@@ -1362,16 +1363,6 @@ def _prepare_inputs(
13621363 self ._update_graph_pad_size (with_prefill , maybe_padded_num_tokens )
13631364 attn_metadata : dict [str , Any ] = {}
13641365
1365- # Prepare input_ids
1366- token_indices = (positions_np +
1367- req_indices * self .input_batch .token_ids_cpu .shape [1 ])
1368- torch .index_select (self .input_batch .token_ids_cpu_tensor .flatten (),
1369- 0 ,
1370- torch .from_numpy (token_indices ),
1371- out = self .input_ids_cpu [:total_num_scheduled_tokens ])
1372- # Copy the tensors to the NPU.
1373- self ._prepare_input_ids (total_num_scheduled_tokens , cu_num_tokens )
1374-
13751366 # _prepare_inputs may reorder the batch, so we must gather
13761367 # multi-modal outputs after that to ensure the correct order
13771368 if self .is_multimodal_model :
0 commit comments