@@ -561,22 +561,47 @@ def schedule(self) -> SchedulerOutput:
             scheduled_spec_decode_tokens,
             req_to_new_blocks,
         )
-        scheduler_output = SchedulerOutput(
-            scheduled_new_reqs=new_reqs_data,
-            scheduled_cached_reqs=cached_reqs_data,
-            num_scheduled_tokens=num_scheduled_tokens,
-            total_num_scheduled_tokens=total_num_scheduled_tokens,
-            scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
-            scheduled_encoder_inputs=scheduled_encoder_inputs,
-            num_common_prefix_blocks=num_common_prefix_blocks,
-            # finished_req_ids is an existing state in the scheduler,
-            # instead of being newly scheduled in this step.
-            # It contains the request IDs that are finished in between
-            # the previous and the current steps.
-            finished_req_ids=self.finished_req_ids,
-            free_encoder_mm_hashes=self.encoder_cache_manager.
-            get_freed_mm_hashes(),
-        )
+        if vllm_version_is("0.11.0"):
+            scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
+                                  scheduled_resumed_reqs)
+            structured_output_request_ids, grammar_bitmask = (
+                self.get_grammar_bitmask(scheduled_requests,
+                                         scheduled_spec_decode_tokens))
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=cached_reqs_data,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+                structured_output_request_ids=structured_output_request_ids,
+                grammar_bitmask=grammar_bitmask,
+            )
+        else:
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=cached_reqs_data,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+            )
 
         # NOTE(Kuntai): this function is designed for multiple purposes:
         # 1. Plan the KV cache store
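The two branches above construct the same `SchedulerOutput`; the only difference is that on vLLM 0.11.0 the scheduler still computes `structured_output_request_ids` and `grammar_bitmask` via `get_grammar_bitmask` and passes them in, while newer versions omit those fields. A minimal, hypothetical sketch of the same branching with the shared keyword arguments factored out is shown below; it reuses only names visible in the diff and is not the patch's actual code.

```python
# Hypothetical sketch, not the patch itself: build the shared SchedulerOutput
# keyword arguments once, then add the two fields that exist only on the
# vLLM 0.11.0 code path. All names are taken from the diff above and are
# assumed to be in scope inside schedule().
common_kwargs = dict(
    scheduled_new_reqs=new_reqs_data,
    scheduled_cached_reqs=cached_reqs_data,
    num_scheduled_tokens=num_scheduled_tokens,
    total_num_scheduled_tokens=total_num_scheduled_tokens,
    scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
    scheduled_encoder_inputs=scheduled_encoder_inputs,
    num_common_prefix_blocks=num_common_prefix_blocks,
    # finished_req_ids is existing scheduler state: requests that finished
    # between the previous step and this one.
    finished_req_ids=self.finished_req_ids,
    free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(),
)
if vllm_version_is("0.11.0"):
    # On 0.11.0 the scheduler computes the grammar bitmask here and hands it
    # to the model runner through SchedulerOutput.
    scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
                          scheduled_resumed_reqs)
    structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask(
        scheduled_requests, scheduled_spec_decode_tokens)
    common_kwargs.update(
        structured_output_request_ids=structured_output_request_ids,
        grammar_bitmask=grammar_bitmask,
    )
scheduler_output = SchedulerOutput(**common_kwargs)
```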