diff --git a/benchmarks/Gemmini/Ops/CMakeLists.txt b/benchmarks/Gemmini/Ops/CMakeLists.txt index 69d3c789..c093e769 100644 --- a/benchmarks/Gemmini/Ops/CMakeLists.txt +++ b/benchmarks/Gemmini/Ops/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(MatMulOp) +add_subdirectory(ConvOp) diff --git a/benchmarks/Gemmini/Ops/ConvOp/CMakeLists.txt b/benchmarks/Gemmini/Ops/ConvOp/CMakeLists.txt new file mode 100644 index 00000000..5425da85 --- /dev/null +++ b/benchmarks/Gemmini/Ops/ConvOp/CMakeLists.txt @@ -0,0 +1,54 @@ + +set(BUDDY_OPT ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt) +set(BUDDY_TRANSLATE ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-translate) +set(BUDDY_LLC ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-llc) +set(INTERFACES ${BUDDY_MLIR_BUILD_DIR}/../frontend/Interfaces) + +set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) +set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) + +include_directories( + ${BENCHMARKS_DIR} + ${GEMMINI_INCLUDE_DIR} + ${GEMMINI_INCLUDE_DIR}/../ + ${INTERFACES} +) + +if (NOT DEFINED ENV{RISCV}) + message(FATAL_ERROR "Can't find RISCV environment variable(missing: RISCV_TOOLCHAIN)") +endif() + +# CMAKE_C_FLAGS is set when configuring cmake. 
+separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}") + +add_custom_command( + OUTPUT buddy_conv.o + COMMAND ${BUDDY_OPT} ${CMAKE_CURRENT_SOURCE_DIR}/conv.mlir + -llvm-request-c-wrappers + -lower-gemmini + > log.mlir + COMMAND ${BUDDY_OPT} ${CMAKE_CURRENT_SOURCE_DIR}/conv.mlir + -llvm-request-c-wrappers + -convert-linalg-to-gemmini + -convert-linalg-to-loops + -lower-gemmini | + ${BUDDY_TRANSLATE} -buddy-to-llvmir | + ${BUDDY_LLC} -filetype=obj -mtriple=riscv64 + -mattr=+buddyext,+D -float-abi=hard + -o buddy_conv.o + # Re-run the pipeline whenever the MLIR kernel source changes. + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/conv.mlir + VERBATIM) + +add_library(BuddyConv STATIC buddy_conv.o) +set_target_properties(BuddyConv PROPERTIES LINKER_LANGUAGE C) + +add_library(ExoConv STATIC ExoConv.c) +set_target_properties(ExoConv PROPERTIES LINKER_LANGUAGE C) + +add_executable(dl-op-gemmini-conv-benchmark Main.cpp) +target_link_libraries(dl-op-gemmini-conv-benchmark + -static + ExoConv + BuddyConv +) diff --git a/benchmarks/Gemmini/Ops/ConvOp/ExoConv.c b/benchmarks/Gemmini/Ops/ConvOp/ExoConv.c new file mode 100644 index 00000000..7f658617 --- /dev/null +++ b/benchmarks/Gemmini/Ops/ConvOp/ExoConv.c @@ -0,0 +1,140 @@ +//===- ExoConv.c ----------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the Exo-lang Conv kernel. +// The kernels are generated from exo-lang python script. 
+// +//===----------------------------------------------------------------------===// + +#include "ExoUtils.h" +#include "gemmini.h" + +// clang-format off +// conv_3( +// output : i8[4, 56, 56, 64] @DRAM, +// bias : i32[1, 64] @DRAM, +// inp : i8[4, 58, 58, 64] @DRAM, +// weights : i8[3, 3, 64, 64] @DRAM, +// act : bool, (passed to gemmini_extended_config_st) +// scale : f32 @DRAM (scale[0] is passed to gemmini_extended_config_st) +// ) -- All shapes are baked into the loop bounds and offsets below. Output columns are processed as three 16-column tiles (buffer res, loop ocol_o) followed by one 8-column remainder tile starting at column 48 (buffer res_1). Weights are moved in only when b == 0, orow_o == 0 and orow_ii + 7 * orow_io == 0 (and ocol_o == 0 for the first tile group); later iterations reuse w_s / w_s_1. An input row is moved in only for the first output row of a block or when krow == 2. +void _exo_conv_3( int8_t* output, const int32_t* bias, const int8_t* inp, const int8_t* weights, bool act, const float* scale ) { + gemmini_extended_config_st((64), (act), (scale)[0]); + gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0); + gemmini_extended3_config_ld((64), 1.0f, 0, 1); + gemmini_extended3_config_ld(0, 1.0f, 0, 0); + int8_t *i_s = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 3 * 30 * sizeof(int8_t))); + int8_t *i_s_1 = (int8_t*) ((uint64_t)gemm_malloc (16 * 8 * 4 * 3 * 30 * sizeof(int8_t))); + int8_t *w_s = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 4 * 3 * 3 * sizeof(int8_t))); + int8_t *w_s_1 = (int8_t*) ((uint64_t)gemm_malloc (16 * 16 * 4 * 4 * 3 * 3 * sizeof(int8_t))); + int32_t *res = (int32_t*) ((uint32_t)gemm_acc_malloc (16 * 16 * 4 * sizeof(int32_t))); + int32_t *res_1 = (int32_t*) ((uint32_t)gemm_acc_malloc (16 * 9 * 4 * sizeof(int32_t))); + for (int_fast32_t b = 0; b < 4; b++) { + for (int_fast32_t ocol_o = 0; ocol_o < 3; ocol_o++) { + for (int_fast32_t orow_o = 0; orow_o < 2; orow_o++) { + for (int_fast32_t orow_io = 0; orow_io < 4; orow_io++) { + for (int_fast32_t orow_ii = 0; orow_ii < 7; orow_ii++) { + gemmini_extended_mvin( ((uint64_t) &bias[0]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (0)/16))), 16, (16) ); + gemmini_extended_mvin( ((uint64_t) &bias[16]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (256)/16))), 16, (16) ); + gemmini_extended_mvin( ((uint64_t) &bias[32]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((2) * (256))/16))), 16, (16) ); + gemmini_extended_mvin( ((uint64_t) &bias[48]), ((uint32_t) 
&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((3) * (256))/16))), 16, (16) ); + for (int_fast32_t krow = 0; krow < 3; krow++) { + for (int_fast32_t kcol = 0; kcol < 3; kcol++) { + if (ocol_o == 0) { + if (b == 0) { + if (orow_o == 0) { + if (orow_ii + 7 * orow_io == 0) { + for (int_fast32_t kch_o = 0; kch_o < 4; kch_o++) { + gemmini_extended_mvin2( &weights[(krow) * (12288) + (kcol) * (4096) + (16 * kch_o) * (64)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024))/16))), 16*(4), (16) ); + } + } + } + } + } + if (orow_ii + 7 * orow_io == 0 || krow == 2) { + gemmini_extended4_config_ld(((struct exo_win_2i8c){ &inp[(b) * (215296) + (krow + orow_ii + 7 * orow_io + 28 * orow_o) * (3712) + (kcol + 16 * ocol_o) * (64)], { 64, 1 } }).strides[0]*1, 1.0f, 0, (16), 2); + gemmini_extended_mvin3( &inp[(b) * (215296) + (krow + orow_ii + 7 * orow_io + 28 * orow_o) * (3712) + (kcol + 16 * ocol_o) * (64)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s)) + ((krow + orow_ii + 7 * orow_io) * (3072) + (kcol) * (1024))/16))), 16*(4), (16) ); + } + for (int_fast32_t kch_o = 0; kch_o < 4; kch_o++) { + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (0)/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s)) + ((krow + orow_ii + 7 * orow_io) * (3072) + (kcol) * (1024) + (kch_o) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (256)/16))) | 0x40000000, (16), (16), (16), (16)); + 
gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s)) + ((krow + orow_ii + 7 * orow_io) * (3072) + (kcol) * (1024) + (kch_o) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((2) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s)) + ((krow + orow_ii + 7 * orow_io) * (3072) + (kcol) * (1024) + (kch_o) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((3) * (256))/16))) | 0x40000000, (16), (16), (16), (16)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s)) + ((krow + orow_ii + 7 * orow_io) * (3072) + (kcol) * (1024) + (kch_o) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16); + } + } + } + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (16 * ocol_o) * (64)]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (0)/16)), (16), (16) ); + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (16 * ocol_o) * (64) + 16]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + (256)/16)), (16), (16) ); + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (16 * ocol_o) * (64) + 32]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((2) * (256))/16)), (16), (16) ); + gemmini_extended_mvout( 
((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (16 * ocol_o) * (64) + 48]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((3) * (256))/16)), (16), (16) ); + } + } + } + } + for (int_fast32_t orow_o = 0; orow_o < 2; orow_o++) { + for (int_fast32_t orow_io = 0; orow_io < 4; orow_io++) { + for (int_fast32_t orow_ii = 0; orow_ii < 7; orow_ii++) { + gemmini_extended_mvin( ((uint64_t) &bias[0]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (0)/16))), 16, (8) ); + gemmini_extended_mvin( ((uint64_t) &bias[16]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (144)/16))), 16, (8) ); + gemmini_extended_mvin( ((uint64_t) &bias[32]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((2) * (144))/16))), 16, (8) ); + gemmini_extended_mvin( ((uint64_t) &bias[48]), ((uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((3) * (144))/16))), 16, (8) ); + for (int_fast32_t krow = 0; krow < 3; krow++) { + for (int_fast32_t kcol = 0; kcol < 3; kcol++) { + if (b == 0) { + if (orow_o == 0) { + if (orow_ii + 7 * orow_io == 0) { + for (int_fast32_t kch_o = 0; kch_o < 4; kch_o++) { + gemmini_extended_mvin2( &weights[(krow) * (12288) + (kcol) * (4096) + (16 * kch_o) * (64)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s_1)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024))/16))), 16*(4), (16) ); + } + } + } + } + if (orow_ii + 7 * orow_io == 0 || krow == 2) { + gemmini_extended4_config_ld(((struct exo_win_2i8c){ &inp[(b) * (215296) + (krow + orow_ii + 7 * orow_io + 28 * orow_o) * (3712) + (48 + kcol) * (64)], { 64, 1 } }).strides[0]*1, 1.0f, 0, (8), 2); + gemmini_extended_mvin3( &inp[(b) * (215296) + (krow + orow_ii + 7 * orow_io + 28 * orow_o) * (3712) + (48 + kcol) * (64)], ((uint64_t) &*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s_1)) + ((krow + orow_ii + 7 * orow_io) * (1536) + (kcol) * (512))/16))), 16*(4), (8) ); + } + for 
(int_fast32_t kch_o = 0; kch_o < 4; kch_o++) { + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s_1)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (0)/16))) | 0x40000000, (16), (16), (16), (8)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s_1)) + ((krow + orow_ii + 7 * orow_io) * (1536) + (kcol) * (512) + (kch_o) * (128))/16))), ~((uint32_t)0), (16), (8), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s_1)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (144)/16))) | 0x40000000, (16), (16), (16), (8)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s_1)) + ((krow + orow_ii + 7 * orow_io) * (1536) + (kcol) * (512) + (kch_o) * (128))/16))), ~((uint32_t)0), (16), (8), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s_1)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((2) * (144))/16))) | 0x40000000, (16), (16), (16), (8)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s_1)) + ((krow + orow_ii + 7 * orow_io) * (1536) + (kcol) * (512) + (kch_o) * (128))/16))), ~((uint32_t)0), (16), (8), 16, 16); + gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)w_s_1)) + ((krow) * (12288) + (kcol) * (4096) + (kch_o) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((3) * (144))/16))) | 0x40000000, (16), (16), (16), (8)); + gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)i_s_1)) + ((krow + orow_ii + 7 * 
orow_io) * (1536) + (kcol) * (512) + (kch_o) * (128))/16))), ~((uint32_t)0), (16), (8), 16, 16); + } + } + } + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (48) * (64)]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (0)/16)), (16), (8) ); + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (48) * (64) + 16]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + (144)/16)), (16), (8) ); + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (48) * (64) + 32]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((2) * (144))/16)), (16), (8) ); + gemmini_extended_mvout( ((uint64_t) &output[(b) * (200704) + (orow_ii + 7 * orow_io + 28 * orow_o) * (3584) + (48) * (64) + 48]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res_1)) + ((3) * (144))/16)), (16), (8) ); + } + } + } + } + gemm_acc_free((uint32_t)(res_1)); + gemm_acc_free((uint32_t)(res)); + gemm_free((uint64_t)(w_s_1)); + gemm_free((uint64_t)(w_s)); + gemm_free((uint64_t)(i_s_1)); + gemm_free((uint64_t)(i_s)); +} +// clang-format on diff --git a/benchmarks/Gemmini/Ops/ConvOp/ExoUtils.h b/benchmarks/Gemmini/Ops/ConvOp/ExoUtils.h new file mode 100644 index 00000000..072779fb --- /dev/null +++ b/benchmarks/Gemmini/Ops/ConvOp/ExoUtils.h @@ -0,0 +1,169 @@ +//===- ExoUtils.h ---------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements Exo-lang helper functions. +// The functions are from exo repository. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +// NOTE(review): the three #include targets above are missing; restore them. +#ifndef GEMM_HEAP_SIZE +#define GEMM_HEAP_SIZE 100000 +#endif + +#ifndef GEMM_DIM +#define GEMM_DIM 16 +#endif + +#ifndef EXO_WIN_2I8C +#define EXO_WIN_2I8C +struct exo_win_2i8c{ + const int8_t * const data; + const int_fast32_t strides[2]; +}; +#endif +// Allocation record; gemm_malloc stores size and loc in units of GEMM_DIM bytes. +typedef struct __attribute__((__packed__)) NewBlock { + uint32_t size; + uint32_t loc; + uint8_t is_used; +} NewBlock; + +NewBlock BLOCKS[GEMM_HEAP_SIZE / sizeof(NewBlock)]; +uint32_t gemm_last_ptr; + +void gemm_init_mem() { + for (uint32_t i = 0; i < sizeof(BLOCKS); i++) + ((uint8_t *)BLOCKS)[i] = 0; + gemm_last_ptr = 0; +} +// First-fit allocation of `size` bytes; returns (uint32_t)-1 on failure. +uint32_t gemm_malloc(long unsigned int size) { + if (size == 0) + return -1; + size = (size + GEMM_DIM - 1) / GEMM_DIM; + int i; + for (i = 0; i < GEMM_HEAP_SIZE / sizeof(NewBlock) && BLOCKS[i].size > 0; + i++) { + if (BLOCKS[i].is_used) + continue; + if (BLOCKS[i].size < size) + continue; + break; + } + if (i >= GEMM_HEAP_SIZE / sizeof(NewBlock)) + return -1; // table full and nothing reusable: never index past BLOCKS + if (BLOCKS[i].size == 0) { + // Unused slot: carve a new block off the end of the region handed out so far. + BLOCKS[i].loc = gemm_last_ptr; + BLOCKS[i].size = size; + gemm_last_ptr += size; + } + BLOCKS[i].is_used = 1; + return BLOCKS[i].loc; +} +// Marks the in-use block located at addr as free; unknown addresses are ignored. +void gemm_free(uint32_t addr) { + for (int i = 0; i < GEMM_HEAP_SIZE / sizeof(NewBlock) && BLOCKS[i].size > 0; + i++) { + if (BLOCKS[i].is_used && BLOCKS[i].loc == addr) { + BLOCKS[i].is_used = 0; + 
return; + } + } +} + +#ifndef GEMM_ACC_HEAP_SIZE +#define GEMM_ACC_HEAP_SIZE 100000 +#endif + +#ifndef GEMM_ACC_DIM +#define GEMM_ACC_DIM 16 +#endif + +typedef struct __attribute__((__packed__)) AccBlock { + uint32_t size; + uint32_t loc; + uint8_t is_used; +} AccBlock; + +// maintain a stack of blocks corresponding to +// a stack alloc and free strategy +#define N_ACC_BLOCKS (GEMM_ACC_HEAP_SIZE / sizeof(AccBlock)) +AccBlock ACC_BLOCKS[N_ACC_BLOCKS]; +uint32_t gemm_acc_free_block; + +void gemm_acc_init_mem() { + uint8_t *buf = (uint8_t *)ACC_BLOCKS; + for (uint32_t i = 0; i < sizeof(ACC_BLOCKS); i++) + buf[i] = 0; + gemm_acc_free_block = 0; +} + +uint32_t gemm_acc_malloc(long unsigned int size) { + // must have a free metadata block and + // this allocation must have > 0 size + if (size == 0) + return -1; + if (gemm_acc_free_block >= N_ACC_BLOCKS) + return -1; + + size = (size + GEMM_ACC_DIM - 1) / GEMM_ACC_DIM; + uint32_t i = gemm_acc_free_block; + + uint32_t loc = 0; + if (i > 0) { + loc = ACC_BLOCKS[i - 1].loc + ACC_BLOCKS[i - 1].size; + } + + ACC_BLOCKS[i].size = size; + ACC_BLOCKS[i].loc = loc; + ACC_BLOCKS[i].is_used = 1; + gemm_acc_free_block = i + 1; + + return (ACC_BLOCKS[i].loc | ((uint32_t)0x80000000)); // bit 31 is set on every returned address; gemm_acc_free masks it off +} + +void gemm_acc_free(uint32_t addr) { + if (gemm_acc_free_block == 0) + return; + addr = addr & (uint32_t)(0x7FFFFFFF); + // first case: free-ing the top of the block-stack + if (ACC_BLOCKS[gemm_acc_free_block - 1].loc == addr) { + ACC_BLOCKS[gemm_acc_free_block - 1].is_used = 0; + + // Then go through and release as many blocks + // as we can + for (int i = gemm_acc_free_block - 1; i >= 0; i--) { + if (ACC_BLOCKS[i].is_used) + break; // loop termination + // otherwise... 
+ gemm_acc_free_block = i; + } + // second case: find the freed block and mark it + } else { + for (int i = gemm_acc_free_block - 1; i >= 0; i--) { + if (ACC_BLOCKS[i].loc == addr) { + ACC_BLOCKS[i].is_used = 0; + break; + } + } + } + return; +} diff --git a/benchmarks/Gemmini/Ops/ConvOp/Main.cpp b/benchmarks/Gemmini/Ops/ConvOp/Main.cpp new file mode 100644 index 00000000..fe2df6ee --- /dev/null +++ b/benchmarks/Gemmini/Ops/ConvOp/Main.cpp @@ -0,0 +1,166 @@ +//===- Main.cpp -----------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This is the main file of Gemmini Conv operation benchmark. +// +//===----------------------------------------------------------------------===// + +#include "gemmini.h" +#include +#include +#include + +#include "Gemmini/Utils.h" +#include +// NOTE(review): the bare #include lines above have lost their targets; restore them. +using namespace buddy::benchmark; + +// ----------------------------------------------------------------------------- +// Benchmark Configuration. You can change the number here as needed. 
+// ----------------------------------------------------------------------------- + +#define _BATCH_SIZE 4 +#define _IN_CH 64 +#define _OUT_CH 64 +#define _IN_DIM 58 +#define _OUT_DIM 56 +#define _KERNEL_DIM 3 +static float c_scale[1] = {1.0f}; +static int32_t _BIAS = 1; + + +// ----------------------------------------------------------------------------- +// Include Kernel Functions. +// ----------------------------------------------------------------------------- + +extern "C" { // MemRef element types and ranks follow the memref types declared in conv.mlir +void _mlir_ciface_gemmini_conv_3(MemRef<int8_t, 4> *input0, + MemRef<int8_t, 2> *input1, + MemRef<int32_t, 1> *inputBias, + MemRef<int8_t, 2> *output); +// void _exo_conv_3(int8_t* output, const int32_t* bias, +// const int8_t* inp, const int8_t* weights, +// bool act, const float* scale ); +} + +// ----------------------------------------------------------------------------- +// Global Variables. +// ----------------------------------------------------------------------------- + +static int8_t input[_BATCH_SIZE * _IN_DIM * _IN_DIM * _IN_CH] row_align(1); // NHWC +static int8_t weights[_KERNEL_DIM * _KERNEL_DIM * _IN_CH * _OUT_CH] row_align(1); // HWCF (kh, kw, in_ch, out_ch), as indexed in main() +static int32_t inputBias[_OUT_CH] row_align(1); // one bias per output channel +static int8_t output[_BATCH_SIZE * _OUT_DIM * _OUT_DIM * _OUT_CH] row_align(1); // NHWC + +intptr_t inputSizes[4] = {_BATCH_SIZE, _IN_DIM, _IN_DIM, _IN_CH}; +intptr_t weightsSizes[2] = {_KERNEL_DIM * _KERNEL_DIM, _IN_CH * _OUT_CH}; +intptr_t biasSizes[1] = {_OUT_CH}; +intptr_t outputSizes[2] = {_BATCH_SIZE * _OUT_DIM * _OUT_DIM, _OUT_CH}; +// Filled with 1 so the MLIR kernel sees the same data main() writes to input[]/weights[]. +MemRef<int8_t, 4> inputAMemRef(inputSizes, 1); +MemRef<int8_t, 2> inputBMemRef(weightsSizes, 1); + +// ----------------------------------------------------------------------------- +// Benchmark Functions. The kernel functions are called here. +// ----------------------------------------------------------------------------- + +// Gemmini native convolution function. +// This function is used to get the expected output results for verification. 
+void nativeConv(int8_t *inputA, int8_t *inputB, int8_t *outputC, + int32_t *inputBias) { + + uint64_t start = gemmini::readCycles(); + tiled_conv_auto(_BATCH_SIZE, _IN_DIM, _IN_DIM, _IN_CH, _OUT_CH, + _OUT_DIM, _OUT_DIM, 1, 1, 1, 0, _KERNEL_DIM, + false, false, false, false, false, + inputA, inputB, inputBias, outputC, + NO_ACTIVATION, ACC_SCALE_IDENTITY, 0, 0, 0, WS); + uint64_t end = gemmini::readCycles(); + + std::cout << "Gemmini native Conv cycles: " << end - start << std::endl; +} + +// Buddy Gemmini dialect Conv benchmark function. +// Verifies the result against expected output. +using MLIRFunctionType = void (*)(MemRef<int8_t, 4> *, MemRef<int8_t, 2> *, + MemRef<int32_t, 1> *, MemRef<int8_t, 2> *); +void buddyConv(int8_t *outputExpected, MLIRFunctionType MLIRFunc, + const std::string &name) { + MemRef<int8_t, 2> outputMemRef(outputSizes, 0); + MemRef<int32_t, 1> biasMemRef(biasSizes, _BIAS); + + uint64_t start = gemmini::readCycles(); + MLIRFunc(&inputAMemRef, &inputBMemRef, &biasMemRef, &outputMemRef); + uint64_t end = gemmini::readCycles(); + + std::cout << name << " cycles: " << end - start << std::endl; + gemmini::verify(outputExpected, outputMemRef.getData(), + _BATCH_SIZE*_OUT_DIM*_OUT_DIM, _OUT_CH, name); +} + + +// Exo-lang Conv benchmark function. +// void exoConv(int8_t *outputExpected) { +// static int8_t outputExo[_BATCH_SIZE * _OUT_DIM * _OUT_DIM * _OUT_CH] = {0}; + +// uint64_t start = gemmini::readCycles(); +// _exo_conv_3(outputExo, inputBias, input, weights, false, c_scale); +// uint64_t end = gemmini::readCycles(); + +// std::cout << "Exo-lang Gemmini Conv cycles: " << end - start << std::endl; +// gemmini::verify(outputExpected, outputExo, +// _BATCH_SIZE * _OUT_DIM * _OUT_DIM, _OUT_CH, +// "Exo-lang Gemmini Conv"); +// } + +// ----------------------------------------------------------------------------- +// Main Function. +// ----------------------------------------------------------------------------- + +int main() { + // Initialize input data. 
+ for (int b = 0; b < _BATCH_SIZE; ++b) { + for (int h = 0; h < _IN_DIM; ++h) { + for (int w = 0; w < _IN_DIM; ++w) { + for (int c = 0; c < _IN_CH; ++c) { + input[((b * _IN_DIM + h) * _IN_DIM + w) * _IN_CH + c] = 1; + } + } + } + } + for (int kh = 0; kh < _KERNEL_DIM; ++kh) { + for (int kw = 0; kw < _KERNEL_DIM; ++kw) { + for (int ic = 0; ic < _IN_CH; ++ic) { + for (int oc = 0; oc < _OUT_CH; ++oc) { + weights[((kh* _KERNEL_DIM + kw)*_IN_CH + ic)*_OUT_CH + oc] = 1; + } + } + } + } + + for (int oc = 0; oc < _OUT_CH; ++oc) { + inputBias[oc] = _BIAS; + } + + std::cout << "\033[34m---------- Verification ----------\033[0m" << std::endl; + + static int8_t outputExpected[_BATCH_SIZE * _OUT_DIM * _OUT_DIM * _OUT_CH] row_align(1) = {0}; // static: ~784 KiB is kept off the stack + nativeConv(input, weights, outputExpected, inputBias); + buddyConv(outputExpected, _mlir_ciface_gemmini_conv_3, "Buddy Gemmini Conv"); + // exoConv(outputExpected); + + return 0; +} diff --git a/benchmarks/Gemmini/Ops/ConvOp/conv.mlir b/benchmarks/Gemmini/Ops/ConvOp/conv.mlir new file mode 100644 index 00000000..4d1e4f8b --- /dev/null +++ b/benchmarks/Gemmini/Ops/ConvOp/conv.mlir @@ -0,0 +1,14 @@ + +func.func @gemmini_conv_3(%input : memref<4x58x58x64xi8>, + %weights : memref<9x4096xi8>, //3x3, 64x64 + %bias : memref<64xi32>, //1x64 + %output : memref<12544x64xi8>) { // 4x56x56 + + %outdim = arith.constant 56 : i64 + %kernelDim = arith.constant 3 : i64 + + gemmini.tile_conv %input %weights %bias %output %outdim %outdim %kernelDim { stride = 1 } : + memref<4x58x58x64xi8> memref<9x4096xi8> memref<64xi32> memref<12544x64xi8> i64 i64 i64 + + return +}