diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index e48ab79f18..2b1bba1464 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -158,7 +158,8 @@ def get_cpu_seqlens(is_decoding, is_unpaged_prefill): repeat_interleave, used for attention metadata. """ if is_decoding: - q_seqlens_cpu = None + q_seqlens_cpu = None if AscendOpsBackend.enable_graph else torch.arange(1, step_context.kv_seqlens.size(0) + 1, + dtype=torch.int32) kv_seqlens_cpu = kv_seqlens_expanded = step_context.kv_seqlens.cpu() elif is_unpaged_prefill: q_seqlens_cpu = step_context.q_seqlens.cpu() diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 13f4e12a58..4531b474ca 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -78,6 +78,7 @@ def paged_token_attention( k_cache, v_cache, attn_output, + q_seqlens, kv_seq_len, max_kv_seq_len, block_offsets, @@ -96,6 +97,7 @@ def paged_token_attention( v_cache, block_offsets, block_size, + q_seqlens, kv_seq_len, max_kv_seq_len, num_q_heads, @@ -167,6 +169,7 @@ def paged_attention_fwd( key_cache, value_cache, attn_output, + q_seqlens, kv_seqlens, max_kv_seq_len, block_offsets, diff --git a/requirements/runtime_ascend.txt b/requirements/runtime_ascend.txt index d94a38d0bf..22d1ca8418 100644 --- a/requirements/runtime_ascend.txt +++ b/requirements/runtime_ascend.txt @@ -22,9 +22,9 @@ safetensors sentencepiece shortuuid tiktoken -torch>=2.3.1,<2.9.0 -torch-npu>=2.3.1,<2.9.0 -torchvision>=0.18.1,<0.24.0 +torch>=2.3.1,<2.10.0 +torch-npu>=2.3.1,<2.10.0 +torchvision>=0.18.1,<0.25.0 transformers uvicorn xgrammar