Commit dc7faca

[Iluvatar GPU] fix ci error caused by rebuild_padding param and cuda graph (#4504)
1 parent d70aacf commit dc7faca

File tree

4 files changed: +100 −41 lines changed


fastdeploy/model_executor/graph_optimization/utils.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -20,6 +20,8 @@
 import paddle
 import pynvml
 
+from fastdeploy.platforms import current_platform
+
 
 @dataclass
 class PaddleMemoryInfo:
@@ -46,8 +48,11 @@ def __init__(
         self.device_id = device_id
         self.print_debug_info = print_debug_info
 
-        pynvml.nvmlInit()
-        self.gpu_memory_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
+        if current_platform.is_iluvatar():
+            self.gpu_memory_handle = None
+        else:
+            pynvml.nvmlInit()
+            self.gpu_memory_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
 
     def __del__(self):
         """ """
```

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -732,7 +732,9 @@ def rebuild_padding(
             seq_lens_decoder,
             seq_lens_encoder,
             output_padding_offset,
+            first_token_out,
             max_input_length,
+            enable_logprob,
         )
     elif current_platform.is_gcu():
         from fastdeploy.model_executor.ops.gcu import rebuild_padding
```
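The two new positional arguments (`first_token_out` and `enable_logprob`) bring the Iluvatar call site in line with the updated `rebuild_padding` op signature; per the commit title, the stale argument list is what caused the CI error. A toy, purely illustrative reproduction of that failure mode (all names hypothetical, not the real compiled op):

```python
# A stand-in function whose signature gained two parameters, like the custom op did.
def rebuild_padding_stub(output_padding_offset, first_token_out, max_input_length, enable_logprob):
    return max_input_length


# An old call site still passing only the pre-update arguments fails immediately.
try:
    rebuild_padding_stub("offsets", 8192)
except TypeError as exc:
    print(exc)  # missing 2 required positional arguments: 'max_input_length' and 'enable_logprob'
```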

fastdeploy/worker/iluvatar_model_runner.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -31,6 +31,8 @@ def __init__(
         rank: int,
         local_rank: int,
     ):
+        # Iluvatar does not support cudagraph
+        fd_config.graph_opt_config.use_cudagraph = False
         super(IluvatarModelRunner, self).__init__(
             fd_config=fd_config, device=device, device_id=device_id, rank=rank, local_rank=local_rank
         )
@@ -40,6 +42,7 @@ def __init__(
         assert not self.cache_config.enable_prefix_caching, "Iluvatar does not support prefix caching"
         self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
         assert not self.mla_cache, "Iluvatar does not support MLA"
+        assert not self.use_cudagraph, "Iluvatar does not support cudagraph"
        if self.enable_mm:
            assert (
                not self.cache_config.enable_chunked_prefill
```
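The runner both forces `use_cudagraph` off before the base constructor runs and asserts afterwards that the flag stayed off. A simplified, self-contained sketch of that pattern with stand-in config classes (not FastDeploy's real ones):

```python
from dataclasses import dataclass, field


@dataclass
class GraphOptConfig:
    use_cudagraph: bool = True


@dataclass
class FDConfig:
    graph_opt_config: GraphOptConfig = field(default_factory=GraphOptConfig)


class BaseModelRunner:
    def __init__(self, fd_config: FDConfig):
        # The base class copies the flag at construction time, so it must already be False here.
        self.use_cudagraph = fd_config.graph_opt_config.use_cudagraph


class IluvatarRunnerSketch(BaseModelRunner):
    def __init__(self, fd_config: FDConfig):
        # Iluvatar does not support CUDA Graph capture, so switch it off before
        # delegating to the base constructor, then check that it stayed off.
        fd_config.graph_opt_config.use_cudagraph = False
        super().__init__(fd_config)
        assert not self.use_cudagraph, "Iluvatar does not support cudagraph"


print(IluvatarRunnerSketch(FDConfig()).use_cudagraph)  # False
```
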
Lines changed: 88 additions & 39 deletions
```diff
@@ -1,42 +1,91 @@
+import functools
+import sys
+import threading
+
 from fastdeploy import LLM, SamplingParams
 from fastdeploy.utils import set_random_seed
 
-set_random_seed(123)
-
-prompts = [
-    "Hello, my name is",
-]
-
-# Sampling parameters
-sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
-
-# Load the model
-llm = LLM(
-    model="/data1/fastdeploy/ERNIE_300B_4L",
-    tensor_parallel_size=8,
-    max_model_len=8192,
-    quantization="wint8",
-    block_size=16,
-)
-
-# Batched inference (the LLM queues requests internally and schedules them dynamically based on available resources)
-outputs = llm.generate(prompts, sampling_params)
-
-assert outputs[0].outputs.token_ids == [
-    23768,
-    97000,
-    47814,
-    59335,
-    68170,
-    183,
-    49080,
-    94717,
-    82966,
-    99140,
-    31615,
-    51497,
-    94851,
-    60764,
-    10889,
-    2,
-], f"{outputs[0].outputs.token_ids}"
+
+def timeout(seconds):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            result = [None]
+            exception = [None]
+
+            def target():
+                try:
+                    result[0] = func(*args, **kwargs)
+                except Exception as e:
+                    exception[0] = e
+
+            thread = threading.Thread(target=target)
+            thread.daemon = True
+            thread.start()
+            thread.join(seconds)
+
+            if thread.is_alive():
+                raise TimeoutError(f"Function timed out after {seconds} seconds")
+
+            if exception[0]:
+                raise exception[0]
+
+            return result[0]
+
+        return wrapper
+
+    return decorator
+
+
+@timeout(60)
+def offline_infer_check():
+    set_random_seed(123)
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
+    # Sampling parameters
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
+
+    # Load the model
+    llm = LLM(
+        model="/data1/fastdeploy/ERNIE_300B_4L",
+        tensor_parallel_size=8,
+        max_model_len=8192,
+        quantization="wint8",
+        block_size=16,
+    )
+
+    # Batched inference (the LLM queues requests internally and schedules them dynamically based on available resources)
+    outputs = llm.generate(prompts, sampling_params)
+
+    assert outputs[0].outputs.token_ids == [
+        23768,
+        97000,
+        47814,
+        59335,
+        68170,
+        183,
+        49080,
+        94717,
+        82966,
+        99140,
+        31615,
+        51497,
+        94851,
+        60764,
+        10889,
+        2,
+    ], f"{outputs[0].outputs.token_ids}"
+    print("PASSED")
+
+
+if __name__ == "__main__":
+    try:
+        result = offline_infer_check()
+        sys.exit(0)
+    except TimeoutError:
+        sys.exit(124)
+    except Exception:
+        sys.exit(1)
```
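The rewritten test wraps the offline inference check in a `timeout` decorator that runs the function in a daemon thread and raises `TimeoutError` if it has not finished within the limit, then maps outcomes to exit codes: 0 on success, 124 on timeout (matching the convention of the coreutils `timeout` command), 1 on any other failure. A small usage sketch reusing the `timeout` decorator defined in the diff above; the sleep durations are arbitrary:

```python
import time


@timeout(2)
def quick_task():
    time.sleep(0.1)
    return "done"


@timeout(1)
def stuck_task():
    time.sleep(10)


print(quick_task())  # finishes well within the 2-second budget, prints "done"
try:
    stuck_task()
except TimeoutError as exc:
    print(exc)  # Function timed out after 1 seconds
```

One caveat of the thread-based approach: the timed-out function keeps running in its daemon thread after `TimeoutError` is raised, so the script relies on `sys.exit` to terminate the process rather than cancelling the work.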
