Skip to content

Commit 6996a63

Browse files
authored
[Frontend] Add Torch 2.8 support and KV cache for DeepSeekR1 model. (#586)
* Add Torch 2.8 and Transformers support, KV cache for DeepSeekR1 model
* Support DeepSeekR1 f16
* Support llama
* Support Bert
* Support Whisper
* Support StableDiffusion
* Update DeepSeekR1 CMakeLists
* Update DeepSeekR1 CMakeLists
1 parent 759a210 commit 6996a63

File tree

14 files changed

+1346
-177
lines changed

14 files changed

+1346
-177
lines changed

examples/BuddyBert/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ add_custom_command(
3737
-buffer-deallocation-simplification
3838
-bufferization-lower-deallocations
3939
-matmul-parallel-vectorization-optimize
40-
-batchmatmul-optimize
40+
# -batchmatmul-optimize
4141
-convert-linalg-to-affine-loops
4242
-affine-loop-fusion
4343
-affine-parallelize

examples/BuddyDeepSeekR1/CMakeLists.txt

Lines changed: 111 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ set(BUFFERIZE_SIMPLE_OPTS "bufferize-function-boundaries")
44
set(TOSA_PIPELINE "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))")
55

66
add_custom_command(
7-
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/forward.mlir
8-
${CMAKE_CURRENT_BINARY_DIR}/subgraph0.mlir
7+
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/forward_prefill.mlir
8+
${CMAKE_CURRENT_BINARY_DIR}/subgraph0_prefill.mlir
9+
${CMAKE_CURRENT_BINARY_DIR}/forward_decode.mlir
10+
${CMAKE_CURRENT_BINARY_DIR}/subgraph0_decode.mlir
911
${CMAKE_CURRENT_BINARY_DIR}/arg0.data
1012
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/import-deepseek-r1.py
1113
--output-dir ${CMAKE_CURRENT_BINARY_DIR}
@@ -23,8 +25,8 @@ add_custom_command(
2325
)
2426

2527
add_custom_command(
26-
OUTPUT forward.o
27-
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward.mlir
28+
OUTPUT forward_prefill.o
29+
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward_prefill.mlir
2830
-simplify-tosa-reshape |
2931
${LLVM_TOOLS_BINARY_DIR}/mlir-opt
3032
-pass-pipeline ${TOSA_PIPELINE} |
@@ -62,14 +64,14 @@ add_custom_command(
6264
${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
6365
${LLVM_TOOLS_BINARY_DIR}/llvm-as |
6466
${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
65-
-o ${CMAKE_CURRENT_BINARY_DIR}/forward.o
66-
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward.mlir
67-
COMMENT "Building forward.o "
67+
-o ${CMAKE_CURRENT_BINARY_DIR}/forward_prefill.o
68+
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward_prefill.mlir
69+
COMMENT "Building forward_prefill.o "
6870
VERBATIM)
6971

7072
add_custom_command(
71-
OUTPUT subgraph.o
72-
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0.mlir
73+
OUTPUT subgraph_prefill.o
74+
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_prefill.mlir
7375
-simplify-tosa-reshape |
7476
${LLVM_TOOLS_BINARY_DIR}/mlir-opt
7577
-pass-pipeline ${TOSA_PIPELINE} |
@@ -109,9 +111,101 @@ add_custom_command(
109111
${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
110112
${LLVM_TOOLS_BINARY_DIR}/llvm-as |
111113
${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
112-
-o ${CMAKE_CURRENT_BINARY_DIR}/subgraph.o
113-
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0.mlir
114-
COMMENT "Building subgraph.o "
114+
-o ${CMAKE_CURRENT_BINARY_DIR}/subgraph_prefill.o
115+
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_prefill.mlir
116+
COMMENT "Building subgraph_prefill.o "
117+
VERBATIM)
118+
119+
add_custom_command(
120+
OUTPUT forward_decode.o
121+
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward_decode.mlir
122+
-simplify-tosa-reshape |
123+
${LLVM_TOOLS_BINARY_DIR}/mlir-opt
124+
-pass-pipeline ${TOSA_PIPELINE} |
125+
${BUDDY_BINARY_DIR}/buddy-opt
126+
-eliminate-empty-tensors
127+
-empty-tensor-to-alloc-tensor
128+
-one-shot-bufferize=${BUFFERIZE_SIMPLE_OPTS}
129+
-expand-strided-metadata
130+
-ownership-based-buffer-deallocation
131+
-buffer-deallocation-simplification
132+
-bufferization-lower-deallocations
133+
-matmul-vectorization
134+
-batchmatmul-optimize
135+
-convert-linalg-to-affine-loops
136+
-affine-loop-fusion
137+
-affine-parallelize
138+
-convert-vector-to-scf
139+
-lower-affine
140+
-convert-scf-to-openmp
141+
-cse
142+
-memref-expand
143+
-arith-expand
144+
-convert-vector-to-llvm
145+
-convert-arith-to-llvm
146+
-finalize-memref-to-llvm
147+
-convert-scf-to-cf
148+
-convert-cf-to-llvm
149+
-llvm-request-c-wrappers
150+
-convert-openmp-to-llvm
151+
-convert-arith-to-llvm
152+
-convert-math-to-llvm
153+
-convert-math-to-libm
154+
-convert-func-to-llvm
155+
-reconcile-unrealized-casts |
156+
${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
157+
${LLVM_TOOLS_BINARY_DIR}/llvm-as |
158+
${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
159+
-o ${CMAKE_CURRENT_BINARY_DIR}/forward_decode.o
160+
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/forward_decode.mlir
161+
COMMENT "Building forward_decode.o "
162+
VERBATIM)
163+
164+
add_custom_command(
165+
OUTPUT subgraph_decode.o
166+
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_decode.mlir
167+
-simplify-tosa-reshape |
168+
${LLVM_TOOLS_BINARY_DIR}/mlir-opt
169+
-pass-pipeline ${TOSA_PIPELINE} |
170+
${BUDDY_BINARY_DIR}/buddy-opt
171+
-eliminate-empty-tensors
172+
-empty-tensor-to-alloc-tensor
173+
-convert-elementwise-to-linalg
174+
-one-shot-bufferize=${BUFFERIZE_SIMPLE_OPTS}
175+
-expand-strided-metadata
176+
-ownership-based-buffer-deallocation
177+
-buffer-deallocation-simplification
178+
-bufferization-lower-deallocations
179+
-matmul-vectorization
180+
# -batchmatmul-optimize
181+
-convert-linalg-to-affine-loops
182+
-affine-loop-fusion
183+
-affine-parallelize
184+
-convert-vector-to-scf
185+
-lower-affine
186+
-convert-scf-to-openmp
187+
-func-bufferize-dynamic-offset
188+
-cse
189+
-memref-expand
190+
-arith-expand
191+
-convert-vector-to-llvm
192+
-convert-arith-to-llvm
193+
-finalize-memref-to-llvm
194+
-convert-scf-to-cf
195+
-convert-cf-to-llvm
196+
-llvm-request-c-wrappers
197+
-convert-openmp-to-llvm
198+
-convert-arith-to-llvm
199+
-convert-math-to-llvm
200+
-convert-math-to-libm
201+
-convert-func-to-llvm
202+
-reconcile-unrealized-casts |
203+
${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
204+
${LLVM_TOOLS_BINARY_DIR}/llvm-as |
205+
${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
206+
-o ${CMAKE_CURRENT_BINARY_DIR}/subgraph_decode.o
207+
DEPENDS buddy-opt ${CMAKE_CURRENT_BINARY_DIR}/subgraph0_decode.mlir
208+
COMMENT "Building subgraph_decode.o "
115209
VERBATIM)
116210

117211
add_custom_command(
@@ -202,7 +296,7 @@ add_custom_command(
202296
COMMENT "Building subgraph.o "
203297
VERBATIM)
204298

205-
add_library(DEEPSEEKR1 STATIC forward.o subgraph.o)
299+
add_library(DEEPSEEKR1 STATIC forward_prefill.o subgraph_prefill.o forward_decode.o subgraph_decode.o)
206300
add_library(DEEPSEEKR1_F16 STATIC forward-f16.o subgraph-f16.o)
207301

208302
SET_SOURCE_FILES_PROPERTIES(
@@ -228,12 +322,12 @@ set(DEEPSEEKR1_EXAMPLE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
228322
set(DEEPSEEKR1_EXAMPLE_BUILD_PATH ${CMAKE_CURRENT_BINARY_DIR})
229323

230324
target_compile_definitions(buddy-deepseek-r1-run PRIVATE
231-
DEEPSEEKR1_EXAMPLE_PATH="${DEEPSEEKR1_EXAMPLE_PATH}"
232-
DEEPSEEKR1_EXAMPLE_BUILD_PATH="${DEEPSEEKR1_EXAMPLE_BUILD_PATH}"
325+
DEEPSEEKR1_EXAMPLE_PATH="${DEEPSEEKR1_EXAMPLE_PATH}/"
326+
DEEPSEEKR1_EXAMPLE_BUILD_PATH="${DEEPSEEKR1_EXAMPLE_BUILD_PATH}/"
233327
)
234328
target_compile_definitions(buddy-deepseek-r1-f16-run PRIVATE
235-
DEEPSEEKR1_EXAMPLE_PATH="${DEEPSEEKR1_EXAMPLE_PATH}"
236-
DEEPSEEKR1_EXAMPLE_BUILD_PATH="${DEEPSEEKR1_EXAMPLE_BUILD_PATH}"
329+
DEEPSEEKR1_EXAMPLE_PATH="${DEEPSEEKR1_EXAMPLE_PATH}/"
330+
DEEPSEEKR1_EXAMPLE_BUILD_PATH="${DEEPSEEKR1_EXAMPLE_BUILD_PATH}/"
237331
)
238332

239333
target_link_directories(buddy-deepseek-r1-run PRIVATE ${LLVM_LIBRARY_DIR})

examples/BuddyDeepSeekR1/buddy-deepseek-r1-f16-main.cpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,12 @@ using namespace buddy;
2929

3030
constexpr size_t ParamsSize = 1777088064;
3131
constexpr size_t MaxVocabSize = 151936;
32-
constexpr size_t MaxTokenLength = 40;
32+
constexpr size_t MaxTokenLength = 20;
3333

3434
/// Declare DeepSeekR1 forward function.
3535
extern "C" void _mlir_ciface_forward(MemRef<uint16_t, 3> *result,
3636
MemRef<uint16_t, 1> *arg0,
37-
Text<size_t, 2> *arg1,
38-
MemRef<long long, 2> *arg2);
37+
Text<size_t, 2> *arg1);
3938

4039
// -----------------------------------------------------------------------------
4140
// Helper Functions
@@ -166,19 +165,15 @@ int main() {
166165
// - Output container.
167166
// - Parameters container.
168167
Text<size_t, 2> outputContainer;
169-
MemRef<uint16_t, 3> resultContainer({1, 40, 151936});
168+
MemRef<uint16_t, 3> resultContainer({1, 20, 151936});
170169
Text<size_t, 2> inputContainer(inputStr);
171170
MemRef<uint16_t, 1> paramsContainer({ParamsSize});
172-
MemRef<long long, 2> attention_mask({1, 40}, 0);
173171

174172
/// Fill data into containers
175173
// - Input: register vocabulary and tokenize the input string.
176174
// - Output: register vocabulary.
177175
// - Parameters: load parameters from the `arg0` file into the container.
178176
tokenizeInput(vocabDir, inputContainer);
179-
for (int i = 0; i < (int)inputContainer.getTokenCnt(); i++) {
180-
attention_mask.getData()[i] = 1;
181-
}
182177
outputContainer.loadVocab(vocabDir);
183178
loadParameters(paramsDir, paramsContainer);
184179

@@ -190,8 +185,7 @@ int main() {
190185
for (int i = 0; i < generateLen; i++) {
191186
const auto inferenceStart = std::chrono::high_resolution_clock::now();
192187
// Execute the forward pass of the model.
193-
_mlir_ciface_forward(&resultContainer, &paramsContainer, &inputContainer,
194-
&attention_mask);
188+
_mlir_ciface_forward(&resultContainer, &paramsContainer, &inputContainer);
195189

196190
const auto inferenceEnd = std::chrono::high_resolution_clock::now();
197191
const std::chrono::duration<double, std::milli> inferenceTime =
@@ -213,7 +207,6 @@ int main() {
213207
}
214208
// Append the generated token into the input and output container.
215209
inputContainer.appendTokenIdx(maxIndex);
216-
attention_mask.getData()[MaxTokenLength - generateLen + i] = 1;
217210
outputContainer.appendTokenIdx(maxIndex);
218211
free(resultContainer.release());
219212
}

0 commit comments

Comments
 (0)