Skip to content

[Bug]: 0.9.2RC1 Qwen3 235B w8a8量化开启EP失败 #2105

@offline0731

Description

@offline0731

Your current environment

def profile_run(self) -> None:
    """Execute a dummy pass sized at ``self.max_num_tokens`` for profiling.

    A dummy forward run is issued first so that compilation is triggered
    for the general shape. On the last pipeline-parallel rank the resulting
    hidden states are then pushed either through the dummy pooler (pooling
    models) or through ``self.model.compute_logits`` (all other models).
    """
    # Trigger compilation for general shape.
    hidden_states = self._dummy_run(self.max_num_tokens)
    output = None
    if get_pp_group().is_last_rank:
        if self.is_pooling_model:
            output = self._dummy_pooler_run(hidden_states)
        else:
            # Profile with the maximum number of requests, which together
            # account for the maximum number of tokens: every request gets
            # an equal share and the last one absorbs the remainder.
            tokens_per_req, leftover = divmod(self.max_num_tokens,
                                              self.max_num_reqs)
            token_counts = [tokens_per_req] * self.max_num_reqs
            token_counts[-1] += leftover
            num_scheduled_tokens = np.array(token_counts, dtype=np.int32)
            # Index of the last token of each request.
            logit_indices = num_scheduled_tokens.cumsum() - 1
            # TODO: need to run a dummy sampler for generate task
            hidden_states = hidden_states[logit_indices]
            output = self.model.compute_logits(hidden_states, None)

🐛 Describe the bug

(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] WorkerProc hit an exception.
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] Traceback (most recent call last):
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 517, in worker_busy_loop
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] output = func(*args, **kwargs)
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 152, in determine_available_memory
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] self.model_runner.profile_run()
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1701, in profile_run
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] hidden_states = hidden_states[logit_indices]
(VllmWorker rank=3 pid=56447) ERROR 07-30 06:16:20 [multiproc_executor.py:522] RuntimeError: ACL stream synchronize failed, error code:507018
(VllmWorker rank=1 pid=56185) INFO 07-30 06:16:20 [monitor.py:34] torch.compile takes 4.12 s in total
[rank1]:[W730 06:16:20.361744142 compiler_depend.ts:57] Warning: E39999: Inner Error!
E39999: [PID: 56185] 2025-07-30-06:16:20.219.133 The error from device(chipId:1, dieId:0), serial number is 25, an exception occurred during AICPU execution, stream_id:2, task_id:285, errcode:21008, msg:inner error.[FUNC:ProcessStarsAicpuErrorInfo][FILE:device_error_proc.cc][LINE:1496]
TraceBack (most recent call last):
Kernel task happen error, retCode=0x2a, [aicpu exception].[FUNC:PreCheckTaskErr][FILE:davinci_kernel_task.cc][LINE:1366]
AICPU Kernel task happen error, retCode=0x2a.[FUNC:GetError][FILE:stream.cc][LINE:1119]
Aicpu kernel execute failed, device_id=1, stream_id=2, task_id=285, errorCode=2a.[FUNC:PrintAicpuErrorInfo][FILE:davinci_kernel_task.cc][LINE:1128]
Aicpu kernel execute failed, device_id=1, stream_id=2, task_id=285, fault op_name=[FUNC:GetError][FILE:stream.cc][LINE:1119]
rtStreamSynchronize execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
synchronize stream failed, runtime result = 507018[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
(function copy_between_host_and_device_opapi)
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] WorkerProc hit an exception.
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] Traceback (most recent call last):
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 517, in worker_busy_loop
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] output = func(*args, **kwargs)
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 152, in determine_available_memory
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] self.model_runner.profile_run()
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1701, in profile_run
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] hidden_states = hidden_states[logit_indices]
(VllmWorker rank=1 pid=56185) ERROR 07-30 06:16:20 [multiproc_executor.py:522] RuntimeError: ACL stream synchronize failed, error code:507018
(VllmWorker rank=0 pid=56178) INFO 07-30 06:16:20 [monitor.py:34] torch.compile takes 4.42 s in total
[rank0]:[W730 06:16:20.372906864 compiler_depend.ts:57] Warning: E39999: Inner Error!
E39999: [PID: 56178] 2025-07-30-06:16:20.230.211 The error from device(chipId:0, dieId:0), serial number is 27, an exception occurred during AICPU execution, stream_id:2, task_id:286, errcode:21008, msg:inner error.[FUNC:ProcessStarsAicpuErrorInfo][FILE:device_error_proc.cc][LINE:1496]
TraceBack (most recent call last):
Kernel task happen error, retCode=0x2a, [aicpu exception].[FUNC:PreCheckTaskErr][FILE:davinci_kernel_task.cc][LINE:1366]
AICPU Kernel task happen error, retCode=0x2a.[FUNC:GetError][FILE:stream.cc][LINE:1119]
Aicpu kernel execute failed, device_id=0, stream_id=2, task_id=286, errorCode=2a.[FUNC:PrintAicpuErrorInfo][FILE:davinci_kernel_task.cc][LINE:1128]
Aicpu kernel execute failed, device_id=0, stream_id=2, task_id=286, fault op_name=[FUNC:GetError][FILE:stream.cc][LINE:1119]
rtStreamSynchronize execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
synchronize stream failed, runtime result = 507018[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
(function copy_between_host_and_device_opapi)
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] WorkerProc hit an exception.
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] Traceback (most recent call last):
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 517, in worker_busy_loop
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] output = func(*args, **kwargs)
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 152, in determine_available_memory
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] self.model_runner.profile_run()
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1701, in profile_run
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] hidden_states = hidden_states[logit_indices]
(VllmWorker rank=0 pid=56178) ERROR 07-30 06:16:20 [multiproc_executor.py:522] RuntimeError: ACL stream synchronize failed, error code:507018
ERROR 07-30 06:16:20 [core.py:586] EngineCore failed to start.
ERROR 07-30 06:16:20 [core.py:586] Traceback (most recent call last):
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core
ERROR 07-30 06:16:20 [core.py:586] engine_core = EngineCoreProc(*args, **kwargs)
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in init
ERROR 07-30 06:16:20 [core.py:586] super().init(vllm_config, executor_class, log_stats,
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 82, in init
ERROR 07-30 06:16:20 [core.py:586] self._initialize_kv_caches(vllm_config)
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 142, in _initialize_kv_caches
ERROR 07-30 06:16:20 [core.py:586] available_gpu_memory = self.model_executor.determine_available_memory()
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/executor/abstract.py", line 76, in determine_available_memory
ERROR 07-30 06:16:20 [core.py:586] output = self.collective_rpc("determine_available_memory")
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 215, in collective_rpc
ERROR 07-30 06:16:20 [core.py:586] result = get_response(w, dequeue_timeout)
ERROR 07-30 06:16:20 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 202, in get_response
ERROR 07-30 06:16:20 [core.py:586] raise RuntimeError(
ERROR 07-30 06:16:20 [core.py:586] RuntimeError: Worker failed with error 'ACL stream synchronize failed, error code:507018', please check the stack trace above for the root cause
[W730 06:16:22.858261943 compiler_depend.ts:526] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.718.847 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeUsedDevices)
[W730 06:16:22.860542008 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.722.390 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.862563250 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.724.490 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.864553442 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.726.561 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.866398652 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.728.498 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.868223023 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.730.302 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.869118032 compiler_depend.ts:526] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.730.327 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeUsedDevices)
[W730 06:16:22.870843091 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.732.136 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.871217776 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.733.148 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.873211937 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.735.124 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.873954906 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.735.550 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.874991027 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.737.046 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.876054519 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.737.922 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.876856668 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.738.871 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.878056901 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.739.920 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.878726988 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.740.794 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.880341726 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.741.978 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.880558428 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.742.606 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.882374518 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.744.402 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.882392808 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.744.221 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.884241489 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.746.298 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.884581523 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.746.327 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.886622125 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.748.444 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.886880658 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.748.081 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.888663337 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.750.525 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.889824740 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.751.406 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.890626549 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.752.527 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.891996724 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.753.743 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.892680212 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56178] 2025-07-30-06:16:22.754.528 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.894178098 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.755.936 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.896279971 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.758.053 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.898495786 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.760.196 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.900581869 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.762.357 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.902467289 compiler_depend.ts:526] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.763.628 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeUsedDevices)
[W730 06:16:22.902736752 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56185] 2025-07-30-06:16:22.764.501 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.904542782 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.766.524 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.906262611 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.768.454 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.907835098 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.770.105 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.909448696 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.771.729 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.911005673 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.773.285 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.912721182 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.774.874 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.914327090 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.776.561 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.916014068 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.778.221 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.917713527 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.780.010 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.919269074 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.781.577 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.920812741 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.783.103 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.922378718 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.784.677 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.923923655 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.786.199 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.925511073 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.787.810 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
[W730 06:16:22.927078260 compiler_depend.ts:508] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.789.339 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function npuSynchronizeDevice)
[W730 06:16:22.928694818 compiler_depend.ts:227] Warning: NPU warning, error code is 507018[Error]:
[Error]: The aicpu execution is abnormal.
Rectify the fault based on the error information in the ascend log.
EH9999: Inner Error!
rtDeviceSynchronizeWithTimeout execute failed, reason=[aicpu exception][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
EH9999: [PID: 56447] 2025-07-30-06:16:22.790.944 wait for compute device to finish failed, runtime result = 507018.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
TraceBack (most recent call last):
(function empty_cache)
ERROR 07-30 06:16:32 [multiproc_executor.py:135] Worker proc VllmWorker-2 died unexpectedly, shutting down executor.
Process EngineCore_0:
Traceback (most recent call last):
File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 590, in run_engine_core
raise e
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core
engine_core = EngineCoreProc(*args, **kwargs)
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in __init__
super().__init__(vllm_config, executor_class, log_stats,
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 82, in __init__
self._initialize_kv_caches(vllm_config)
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 142, in _initialize_kv_caches
available_gpu_memory = self.model_executor.determine_available_memory()
File "/vllm-workspace/vllm/vllm/v1/executor/abstract.py", line 76, in determine_available_memory
output = self.collective_rpc("determine_available_memory")
File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 215, in collective_rpc
result = get_response(w, dequeue_timeout)
File "/vllm-workspace/vllm/vllm/v1/executor/multiproc_executor.py", line 202, in get_response
raise RuntimeError(
RuntimeError: Worker failed with error 'ACL stream synchronize failed, error code:507018', please check the stack trace above for the root cause
Traceback (most recent call last):
File "/usr/local/python3.10.17/bin/vllm", line 8, in <module>
sys.exit(main())
File "/vllm-workspace/vllm/vllm/entrypoints/cli/main.py", line 65, in main
args.dispatch_function(args)
File "/vllm-workspace/vllm/vllm/entrypoints/cli/serve.py", line 55, in cmd
uvloop.run(run_server(args))
File "/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
return loop.run_until_complete(wrapper())
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
return await main
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1431, in run_server
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1451, in run_server_worker
async with build_async_engine_client(args, client_config) as engine_client:
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 158, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 194, in build_async_engine_client_from_engine_args
async_llm = AsyncLLM.from_vllm_config(
File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 162, in from_vllm_config
return cls(
File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 124, in __init__
self.engine_core = EngineCoreClient.make_async_mp_client(
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 96, in make_async_mp_client
return AsyncMPClient(*client_args)
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 666, in __init__
super().__init__(
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 403, in __init__
with launch_core_engines(vllm_config, executor_class,
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 434, in launch_core_engines
wait_for_engine_startup(
File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 484, in wait_for_engine_startup
raise RuntimeError("Engine core initialization failed. "
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
[ERROR] 2025-07-30-06:16:37 (PID:55707, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions