From dfacf4d6997d98c838fb8b324f893d0d1d109773 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 14 Sep 2022 14:53:47 +0200 Subject: [PATCH 01/24] [config] Parametrize scratchpad banking factor --- config/mempool.mk | 3 +++ config/minpool.mk | 3 +++ config/systolic.mk | 3 +++ config/terapool.mk | 3 +++ hardware/Makefile | 2 +- hardware/src/mempool_pkg.sv | 2 +- software/apps/memcpy/main.c | 2 +- software/runtime/runtime.h | 2 ++ software/runtime/runtime.mk | 1 + 9 files changed, 18 insertions(+), 3 deletions(-) diff --git a/config/mempool.mk b/config/mempool.mk index a3df45b35..ec2c34154 100644 --- a/config/mempool.mk +++ b/config/mempool.mk @@ -17,6 +17,9 @@ num_groups ?= 4 # Number of cores per MemPool tile num_cores_per_tile ?= 4 +# L1 scratchpad banking factor +banking_factor ?= 4 + # Radix for hierarchical AXI interconnect axi_hier_radix ?= 20 diff --git a/config/minpool.mk b/config/minpool.mk index 455cd30e6..484bef548 100644 --- a/config/minpool.mk +++ b/config/minpool.mk @@ -17,6 +17,9 @@ num_groups ?= 4 # Number of cores per MemPool tile num_cores_per_tile ?= 4 +# L1 scratchpad banking factor +banking_factor ?= 4 + # Number of DMA backends in each group dmas_per_group ?= 1 diff --git a/config/systolic.mk b/config/systolic.mk index 5de36e4c5..d317e0dad 100644 --- a/config/systolic.mk +++ b/config/systolic.mk @@ -15,6 +15,9 @@ num_groups ?= 4 # Number of cores per MemPool tile num_cores_per_tile ?= 4 +# L1 scratchpad banking factor +banking_factor ?= 4 + # Radix for hierarchical AXI interconnect axi_hier_radix ?= 16 diff --git a/config/terapool.mk b/config/terapool.mk index 5d3f90854..a9df13cba 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -17,6 +17,9 @@ num_groups ?= 8 # Number of cores per Terapool tile num_cores_per_tile ?= 8 +# L1 scratchpad banking factor +banking_factor ?= 4 + # Radix for hierarchical AXI interconnect axi_hier_radix ?= 8 diff --git a/hardware/Makefile b/hardware/Makefile index 7965053d4..32454e193 100644 --- 
a/hardware/Makefile +++ b/hardware/Makefile @@ -87,7 +87,7 @@ endif vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233 vlog_args += -work $(library) # Defines -vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) +vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor) vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks) vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg) vlog_defs += -DSNITCH_TRACE=$(snitch_trace) diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index a11eeeff1..5ba3234f4 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -35,7 +35,7 @@ package mempool_pkg; localparam integer unsigned DataWidth = 32; localparam integer unsigned BeWidth = DataWidth / 8; localparam integer unsigned ByteOffset = $clog2(BeWidth); - localparam integer unsigned BankingFactor = 4; + localparam integer unsigned BankingFactor = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif; localparam bit LrScEnable = 1'b1; localparam integer unsigned TCDMSizePerBank = 1024; // [B] localparam integer unsigned NumBanks = NumCores * BankingFactor; diff --git a/software/apps/memcpy/main.c b/software/apps/memcpy/main.c index c92a688a1..f93d2e0d1 100644 --- a/software/apps/memcpy/main.c +++ b/software/apps/memcpy/main.c @@ -27,7 +27,7 @@ #ifndef SIZE #define SIZE ((NUM_CORES) * (NUM_CORES)*2) #endif -#define BANKING_FACTOR (4) +// Assume banking factor of 4 uint32_t l2_data_a[SIZE] __attribute__((section(".l2"))) __attribute__((aligned(NUM_CORES * 4 * 4))); diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 108e217a5..12376e52d 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -11,6 +11,8 @@ #include #include +#define NUM_BANKS_PER_TILE (NUM_CORES_PER_TILE * BANKING_FACTOR) + extern char 
l1_alloc_base; extern uint32_t atomic_barrier; extern volatile uint32_t wake_up_reg; diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 717a432d9..ee4003663 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -64,6 +64,7 @@ DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DP DEFINES += -DNUM_CORES=$(num_cores) DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) +DEFINES += -DBANKING_FACTOR=$(banking_factor) DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}') DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}') DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}') From 738a8cfafae8383180cc63a35cb0d33eb69511df Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Tue, 18 May 2021 01:43:01 +0200 Subject: [PATCH 02/24] [hardware] Add support for atomic Xqueue operations to TCDM adapter --- Bender.yml | 1 + config/config.mk | 3 + config/systolic.mk | 8 +- hardware/Makefile | 2 +- hardware/src/mempool_pkg.sv | 3 + hardware/src/mempool_tile.sv | 87 +++-- hardware/src/tcdm_adapter_xqueue.sv | 501 ++++++++++++++++++++++++++++ 7 files changed, 574 insertions(+), 31 deletions(-) create mode 100644 hardware/src/tcdm_adapter_xqueue.sv diff --git a/Bender.yml b/Bender.yml index 08b62d28c..982aa0c6b 100644 --- a/Bender.yml +++ b/Bender.yml @@ -28,6 +28,7 @@ sources: - hardware/src/mempool_cc.sv - hardware/src/snitch_addr_demux.sv - hardware/src/tcdm_adapter.sv + - hardware/src/tcdm_adapter_xqueue.sv - hardware/src/tcdm_shim.sv - hardware/src/tcdm_wide_narrow_mux.sv - hardware/src/address_scrambler.sv diff --git a/config/config.mk b/config/config.mk index ea0ff5425..fb01a9006 100644 --- a/config/config.mk +++ b/config/config.mk @@ -56,6 +56,9 @@ dmas_per_group ?= 4 ## Xqueues configuration ## ############################# +# 
Hardware queues for systolic (atomic ISA extension in TCDM adapter) +xqueue ?= 0 + # XQueue extension's queue size in each memory bank (in words) xqueue_size ?= 0 diff --git a/config/systolic.mk b/config/systolic.mk index d317e0dad..e14ce5a99 100644 --- a/config/systolic.mk +++ b/config/systolic.mk @@ -32,6 +32,10 @@ seq_mem_size ?= 2048 ## Xqueues configuration ## ############################# -# Xqueue extension's queue size (in queue entries) -# in each memory bank (assume banking factor of 4) +# Hardware queues for systolic (atomic ISA extension in TCDM adapter) +xqueue ?= 1 + +# Systolic queues size (assume banking factor of 4) for: +# - software queues emulation (size measured in queue entries) +# - hardware xqueue's queue in each memory bank (size measured in words) xqueue_size ?= 4 diff --git a/hardware/Makefile b/hardware/Makefile index 32454e193..046d9ed04 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -95,7 +95,7 @@ vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width) vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width) vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group) vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group) -vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size) +vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size) # Traffic generation enabled ifdef tg diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 5ba3234f4..ce7915ee3 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -258,6 +258,9 @@ package mempool_pkg; * QUEUE PARAMETERS * **********************/ + // Enable hardware queues (Xqueue atomic extension in the TCDM adapter) + localparam bit Xqueue = `ifdef XQUEUE `XQUEUE `else 1'b0 `endif; + // Size of xqueues in words (must be a power of two) localparam int unsigned XQueueSize = `ifdef XQUEUE_SIZE `XQUEUE_SIZE `else 0 `endif; diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 
a3a6aa50b..19dacacae 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -381,34 +381,65 @@ module mempool_tile assign bank_resp_payload[b].rdata.amo = '0; // Don't care assign bank_resp_wide[b] = meta_out.wide; - tcdm_adapter #( - .AddrWidth (TCDMAddrMemWidth), - .DataWidth (DataWidth ), - .metadata_t (bank_metadata_t ), - .LrScEnable (LrScEnable ), - .RegisterAmo(1'b0 ) - ) i_tcdm_adapter ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .in_valid_i (bank_req_valid[b] ), - .in_ready_o (bank_req_ready[b] ), - .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]), - .in_amo_i (bank_req_payload[b].wdata.amo ), - .in_write_i (bank_req_payload[b].wen ), - .in_wdata_i (bank_req_payload[b].wdata.data ), - .in_meta_i (meta_in ), - .in_be_i (bank_req_payload[b].be ), - .in_valid_o (bank_resp_valid[b] ), - .in_ready_i (bank_resp_ready[b] ), - .in_rdata_o (bank_resp_payload[b].rdata.data ), - .in_meta_o (meta_out ), - .out_req_o (req_valid ), - .out_add_o (req_addr ), - .out_write_o (req_write ), - .out_wdata_o (req_wdata ), - .out_be_o (req_be ), - .out_rdata_i (resp_rdata ) - ); + if (Xqueue) begin: gen_tcdm_adapter_xqueue + tcdm_adapter_xqueue #( + .AddrWidth (TCDMAddrMemWidth), + .DataWidth (DataWidth ), + .XQueueSize (XQueueSize ), + .metadata_t (bank_metadata_t ), + .RegisterAmo(1'b0 ) + ) i_tcdm_adapter ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .in_valid_i (bank_req_valid[b] ), + .in_ready_o (bank_req_ready[b] ), + .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]), + .in_amo_i (bank_req_payload[b].wdata.amo ), + .in_write_i (bank_req_payload[b].wen ), + .in_wdata_i (bank_req_payload[b].wdata.data ), + .in_meta_i (meta_in ), + .in_be_i (bank_req_payload[b].be ), + .in_valid_o (bank_resp_valid[b] ), + .in_ready_i (bank_resp_ready[b] ), + .in_rdata_o (bank_resp_payload[b].rdata.data ), + .in_meta_o (meta_out ), + .out_req_o (req_valid ), + .out_add_o (req_addr ), + 
.out_write_o (req_write ), + .out_wdata_o (req_wdata ), + .out_be_o (req_be ), + .out_rdata_i (resp_rdata ) + ); + end else begin: gen_tcdm_adapter + tcdm_adapter #( + .AddrWidth (TCDMAddrMemWidth), + .DataWidth (DataWidth ), + .metadata_t (bank_metadata_t ), + .LrScEnable (LrScEnable ), + .RegisterAmo(1'b0 ) + ) i_tcdm_adapter ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .in_valid_i (bank_req_valid[b] ), + .in_ready_o (bank_req_ready[b] ), + .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]), + .in_amo_i (bank_req_payload[b].wdata.amo ), + .in_write_i (bank_req_payload[b].wen ), + .in_wdata_i (bank_req_payload[b].wdata.data ), + .in_meta_i (meta_in ), + .in_be_i (bank_req_payload[b].be ), + .in_valid_o (bank_resp_valid[b] ), + .in_ready_i (bank_resp_ready[b] ), + .in_rdata_o (bank_resp_payload[b].rdata.data ), + .in_meta_o (meta_out ), + .out_req_o (req_valid ), + .out_add_o (req_addr ), + .out_write_o (req_write ), + .out_wdata_o (req_wdata ), + .out_be_o (req_be ), + .out_rdata_i (resp_rdata ) + ); + end // Bank tc_sram #( diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv new file mode 100644 index 000000000..5f038c2e4 --- /dev/null +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -0,0 +1,501 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Handles the protocol conversion from valid/ready to req/gnt and correctly returns +// the metadata. Additionally, it handles atomics. Hence, it needs to be instantiated in front of +// an SRAM over which it has exclusive access. 
+// +// Author: Samuel Riedel + +`include "common_cells/registers.svh" + +module tcdm_adapter_xqueue #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + parameter int unsigned XQueueSize = 4, + parameter type metadata_t = logic, + parameter bit RegisterAmo = 1'b0, // Cut path between request and response at the cost of increased AMO latency + // Dependent parameters. DO NOT CHANGE. + localparam int unsigned BeWidth = DataWidth/8, + localparam int unsigned QCntWidth = $clog2(XQueueSize) +) ( + input logic clk_i, + input logic rst_ni, + // master side + input logic in_valid_i, // Bank request + output logic in_ready_o, // Bank grant + input logic [AddrWidth-1:0] in_address_i, // Address + input logic [3:0] in_amo_i, // Atomic Memory Operation + input logic in_write_i, // 1: Store, 0: Load + input logic [DataWidth-1:0] in_wdata_i, // Write data + input metadata_t in_meta_i, // Meta data + input logic [BeWidth-1:0] in_be_i, // Byte enable + output logic in_valid_o, // Response valid + input logic in_ready_i, // Response ready + output logic [DataWidth-1:0] in_rdata_o, // Read data + output metadata_t in_meta_o, // Meta data + // slave side + output logic out_req_o, // Bank request + output logic [AddrWidth-1:0] out_add_o, // Address + output logic out_write_o, // 1: Store, 0: Load + output logic [DataWidth-1:0] out_wdata_o, // Write data + output logic [BeWidth-1:0] out_be_o, // Bit enable + input logic [DataWidth-1:0] out_rdata_i // Read data +); + + typedef enum logic [3:0] { + AMONone = 4'h0, + AMOSwap = 4'h1, + AMOAdd = 4'h2, + AMOAnd = 4'h3, + AMOOr = 4'h4, + AMOXor = 4'h5, + AMOMax = 4'h6, + AMOMaxu = 4'h7, + AMOMin = 4'h8, + AMOMinu = 4'h9, + AMOLR = 4'hA, + AMOSC = 4'hB, + QPush = 4'hC, + QPop = 4'hD + } amo_op_t; + + typedef enum logic [2:0] { + Idle, DoAMO, WriteBackAMO, ResolveQPushStall, ResolveQPopStall + } state_e; + + // Stored data in spill registers and fall through register + metadata_t stored_meta_data; + metadata_t 
stored_smeta_data; + logic[DataWidth-1:0] resp_in_data; + + // Handshake signals for spill registers and fall through register + logic meta_in_vld, meta_in_rdy, meta_out_vld, meta_out_rdy; + logic smeta_in_vld, smeta_in_rdy, smeta_out_vld, smeta_out_rdy; + logic rdata_in_vld_d, rdata_in_vld_q; + logic rdata_in_rdy, rdata_out_vld, rdata_out_rdy; + + // Response meta data selection and valid signals + logic sresp_select_d, sresp_select_q; + logic resp_vld; + logic sresp_vld; + + // FSM related signals + state_e state_q, state_d; + logic vld_amo_op; + logic req_accepted, resp_accepted; + logic queue_stalled_d, queue_stalled_q; + + // Temporary storage for AMO operations + amo_op_t amo_op_d, amo_op_q; + logic [AddrWidth-1:0] addr_d, addr_q; + + // AMO ALU signals + logic [31:0] amo_operand_a; + logic [31:0] amo_operand_b_d, amo_operand_b_q; + logic [31:0] amo_result, amo_result_q; + + // Queue counters + logic unsigned [QCntWidth-1:0] curr_tail_d, curr_tail_q; + logic unsigned [QCntWidth-1:0] next_tail_d, next_tail_q; + logic unsigned [QCntWidth-1:0] curr_head_d, curr_head_q; + + // Queue counter increment + logic unsigned [QCntWidth-1:0] increment_operand, increment_result; + + // Queue management signals + logic queue_empty; + logic queue_full; + logic increment_tail, increment_head; + logic stalled_queue_op; + + // Temporary storage of write data for stalled queue push + logic[DataWidth-1:0] qpush_data_d, qpush_data_q; + + // Stores the metadata at handshake (except stalled queue operations) + spill_register #( + .T (metadata_t), + .Bypass(1'b0 ) + ) i_meta_register ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .valid_i(meta_in_vld ), + .ready_o(meta_in_rdy ), + .data_i (in_meta_i ), + .valid_o(meta_out_vld ), + .ready_i(meta_out_rdy ), + .data_o (stored_meta_data) + ); + assign meta_in_vld = req_accepted & !in_write_i & !stalled_queue_op; + assign meta_out_rdy = sresp_select_q ? 
1'b0 : resp_accepted; + + // Stores the metadata at handshake of stalled queue operations + spill_register #( + .T (metadata_t), + .Bypass(1'b0 ) + ) i_stallmeta_register ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .valid_i(smeta_in_vld ), + .ready_o(smeta_in_rdy ), + .data_i (in_meta_i ), + .valid_o(smeta_out_vld ), + .ready_i(smeta_out_rdy ), + .data_o (stored_smeta_data) + ); + assign smeta_in_vld = req_accepted & stalled_queue_op; + assign smeta_out_rdy = sresp_select_q ? resp_accepted : 1'b0; + + // Store response data if it's not accepted immediately + fall_through_register #( + .T(logic[DataWidth-1:0]) + ) i_rdata_register ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .clr_i (1'b0 ), + .testmode_i(1'b0 ), + .data_i (resp_in_data ), + .valid_i (rdata_in_vld_q), + .ready_o (rdata_in_rdy ), + .data_o (in_rdata_o ), + .valid_o (rdata_out_vld ), + .ready_i (rdata_out_rdy ) + ); + assign resp_in_data = out_rdata_i; + assign rdata_out_rdy = resp_accepted; + + // Output response valid if both meta and read data are available (the read data will always be last) + assign resp_vld = meta_out_vld & rdata_out_vld; + assign sresp_vld = smeta_out_vld & rdata_out_vld; + // Select output valid depending on response selection + assign in_valid_o = sresp_select_q ? sresp_vld : resp_vld; + // Select output meta data depending on response selection + assign in_meta_o = sresp_select_q ? 
stored_smeta_data : stored_meta_data; + + // Exclude queue operations as valid amo operations + assign vld_amo_op = !(amo_op_t'(in_amo_i) inside {AMONone, QPush, QPop}); + // Request is accepted on successful input handshake + assign req_accepted = in_valid_i & in_ready_o; + // Response is accepted on successful output handshake + assign resp_accepted = in_ready_i & in_valid_o; + + always_comb begin + // Default + amo_op_d = AMONone; + addr_d = addr_q; + amo_operand_b_d = amo_operand_b_q; + state_d = state_q; + sresp_select_d = sresp_select_q; + queue_stalled_d = queue_stalled_q; + qpush_data_d = qpush_data_q; + + // While response is pending no requests are accepted + in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1; + + // Feed-through of request + out_req_o = req_accepted; + out_add_o = in_address_i; + out_write_o = in_write_i; + out_wdata_o = in_wdata_i; + out_be_o = in_be_i; + + // Response data as feed-through of read data + // resp_in_data = out_rdata_i; + + // Response is acquired a cycle after a read access + rdata_in_vld_d = out_req_o & !out_write_o; + + // Flags to increment queue counters + increment_tail = 1'b0; + increment_head = 1'b0; + + // FSM + unique case (state_q) + // Idle State handles normal load/stores, non-stalled queue operations + // and the initial read of AMO operations (single cycle operations) + // In case of pending queue stall or AMO operations transition away + Idle: begin + // Prepare queue push + if (amo_op_t'(in_amo_i) == QPush) begin + // Write data at tail of queue + out_add_o = curr_tail_q; + out_write_o = 1'b1; + end + + // Prepare queue pop + if (amo_op_t'(in_amo_i) == QPop) begin + // Read data at head of queue + out_add_o = curr_head_q; + end + + // Request accepted (triggers memory access) + if (req_accepted) begin + // Reset meta data selection to default meta data + sresp_select_d = 1'b0; + + // AMO operation + if (vld_amo_op) begin + amo_op_d = amo_op_t'(in_amo_i); + addr_d = in_address_i; + amo_operand_b_d = 
in_wdata_i; + state_d = DoAMO; + end + + // Queue push + if (amo_op_t'(in_amo_i) == QPush) begin + if (queue_full) begin + // Set flag and store queue push data for later + queue_stalled_d = 1'b1; + qpush_data_d = in_wdata_i; // TODO: MIGHT NOT BE NEEDED + // Prevent acquisition of response data (TODO: might not be needed) + rdata_in_vld_d = 1'b0; + end else begin + // Set increment flag + increment_tail = 1'b1; + // Force acquisition of response data despite a write access + // Response data will match the write data of the write access + rdata_in_vld_d = 1'b1; + // Previous queue pop failed due to empty queue + if (queue_stalled_q) begin + queue_stalled_d = 1'b0; + state_d = ResolveQPopStall; + end + end + end + + // Queue pop + if (amo_op_t'(in_amo_i) == QPop) begin + if (queue_empty) begin + // Set flag + queue_stalled_d = 1'b1; + // Prevent acquisition of response data despite read access + rdata_in_vld_d = 1'b0; + end else begin + // Set increment flag + increment_head = 1'b1; + // Previous queue push failed due to full queue + if (queue_stalled_q) begin + queue_stalled_d = 1'b0; + state_d = ResolveQPushStall; + end + end + end + end + end + + // DoAMO & WriteBackAMO State claims the memory interface for AMO write + DoAMO, WriteBackAMO: begin + in_ready_o = 1'b0; + // Return to Idle one cycle later if we cut the path + state_d = (RegisterAmo && state_q != WriteBackAMO) ? 
WriteBackAMO : Idle; + // Commit AMO + out_req_o = 1'b1; + out_write_o = 1'b1; + out_add_o = addr_q; + out_be_o = 4'b1111; + // serve from register if we cut the path + if (RegisterAmo) begin + out_wdata_o = amo_result_q; + end else begin + out_wdata_o = amo_result; + end + end + + // ResolveQPushStall State blocks any requests until queue pop response + // has been accepted and then executes the queue push + ResolveQPushStall: begin + // Do not accept any requests during resolve + in_ready_o = 1'b0; + // Prepare queue push (write data at tail of queue) + // TODO: INSTEAD READ STORED DATA FOR PUSH RESPONSE + out_add_o = curr_tail_q; + out_write_o = 1'b1; + out_wdata_o = qpush_data_q; + out_be_o = 4'b1111; + // Wait until pop response accepted + if (resp_accepted) begin + // Set success flag + increment_tail = 1'b1; + // Trigger memory access + out_req_o = 1'b1; + // Force acquisition of response data despite a write access + // Response data will match the write data of the write access + rdata_in_vld_d = 1'b1; + // Set meta data selection to stalled meta data + sresp_select_d = 1'b1; + // Return to Idle + state_d = Idle; + end + end + + // ResolveQPopStall State blocks any requests until queue push response + // has been accepted and then executes the queue pop + ResolveQPopStall: begin + // Do not accept any requests during resolve + in_ready_o = 1'b0; + // Prepare queue pop (read data at head of queue) + out_add_o = curr_head_q; + out_write_o = 1'b0; + out_be_o = 4'b1111; + // Wait until push response accepted + if (resp_accepted) begin + // Set success flag + increment_head = 1'b1; + // Trigger memory access + out_req_o = 1'b1; + // Set meta data selection to stalled meta data + sresp_select_d = 1'b1; + // Return to Idle + state_d = Idle; + end + end + default:; + endcase + end + + // ---------------- + // AMO ALU + // ---------------- + logic [33:0] adder_sum; + logic [32:0] adder_operand_a, adder_operand_b; + + assign amo_operand_a = out_rdata_i; + assign 
adder_sum = adder_operand_a + adder_operand_b; + /* verilator lint_off WIDTH */ + always_comb begin : amo_alu + + adder_operand_a = $signed(amo_operand_a); + adder_operand_b = $signed(amo_operand_b_q); + + amo_result = amo_operand_b_q; + + unique case (amo_op_q) + // the default is to output operand_b + AMOSwap:; + AMOAdd: amo_result = adder_sum[31:0]; + AMOAnd: amo_result = amo_operand_a & amo_operand_b_q; + AMOOr: amo_result = amo_operand_a | amo_operand_b_q; + AMOXor: amo_result = amo_operand_a ^ amo_operand_b_q; + AMOMax: begin + adder_operand_b = -$signed(amo_operand_b_q); + amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a; + end + AMOMin: begin + adder_operand_b = -$signed(amo_operand_b_q); + amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q; + end + AMOMaxu: begin + adder_operand_a = $unsigned(amo_operand_a); + adder_operand_b = -$unsigned(amo_operand_b_q); + amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a; + end + AMOMinu: begin + adder_operand_a = $unsigned(amo_operand_a); + adder_operand_b = -$unsigned(amo_operand_b_q); + amo_result = adder_sum[32] ? 
amo_operand_a : amo_operand_b_q; + end + default: amo_result = '0; + endcase + end + + if (RegisterAmo) begin : gen_amo_slice + `FFLNR(amo_result_q, amo_result, (state_q == DoAMO), clk_i) + end else begin : gen_amo_slice + assign amo_result_q = '0; + end + + // ---------------- + // QUEUE MANAGEMENT + // ---------------- + assign queue_empty = (curr_head_q == curr_tail_q); + assign queue_full = (curr_head_q == next_tail_q); + + assign increment_result = increment_operand + 1; + + always_comb begin : queue_management + // Default + curr_tail_d = curr_tail_q; + next_tail_d = next_tail_q; + curr_head_d = curr_head_q; + + // Increment queue counters + increment_operand = curr_head_q; + if (increment_tail) begin + increment_operand = next_tail_q; + curr_tail_d = next_tail_q; + next_tail_d = increment_result; + end + if (increment_head) begin + increment_operand = curr_head_q; + curr_head_d = increment_result; + end + + // Select spill register for meta data + unique case (amo_op_t'(in_amo_i)) + QPush: stalled_queue_op = queue_full; + QPop: stalled_queue_op = queue_empty; + default: stalled_queue_op = 1'b0; + endcase + end + + // ---------------- + // SEQUENTIAL PROCESS + // ---------------- + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + state_q <= Idle; + amo_op_q <= amo_op_t'('0); + addr_q <= '0; + amo_operand_b_q <= '0; + rdata_in_vld_q <= 1'b0; + sresp_select_q <= 1'b0; + curr_tail_q <= 0; + next_tail_q <= 1; + curr_head_q <= 0; + queue_stalled_q <= 1'b0; + qpush_data_q <= '0; + end else begin + state_q <= state_d; + amo_op_q <= amo_op_d; + addr_q <= addr_d; + amo_operand_b_q <= amo_operand_b_d; + rdata_in_vld_q <= rdata_in_vld_d; + sresp_select_q <= sresp_select_d; + curr_tail_q <= curr_tail_d; + next_tail_q <= next_tail_d; + curr_head_q <= curr_head_d; + queue_stalled_q <= queue_stalled_d; + qpush_data_q <= qpush_data_d; + end + end + + // ---------------- + // ASSERTIONS + // ---------------- + // pragma translate_off + // Check for 
unsupported parameters + if (DataWidth != 32) begin + $error($sformatf("Module currently only supports DataWidth = 32. DataWidth is currently set to: %0d", DataWidth)); + end + + `ifndef VERILATOR + meta_full : assert property( + @(posedge clk_i) disable iff (~rst_ni) (meta_in_vld |-> meta_in_rdy)) + else $fatal (1, "Trying to push new data although the i_meta_register is not ready."); + `endif + + `ifndef VERILATOR + smeta_full : assert property( + @(posedge clk_i) disable iff (~rst_ni) (smeta_in_vld |-> smeta_in_rdy)) + else $fatal (1, "Trying to push new data although the i_stallmeta_register is not ready."); + `endif + + `ifndef VERILATOR + rdata_full : assert property( + @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy)) + else $fatal (1, "Trying to push new data although the i_rdata_register is not ready."); + `endif + // pragma translate_on + +endmodule From 49bc8a5c691f6bb3c224c1a352670302a59a0dcf Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 22 Aug 2022 14:16:05 +0200 Subject: [PATCH 03/24] [toolchain] Add toolchain support to xqueues extension (in standard atomic extension, illegal) --- hardware/deps/snitch/src/riscv_instr.sv | 2 ++ hardware/src/tcdm_adapter_xqueue.sv | 4 +++- software/runtime/encoding.h | 6 ++++++ toolchain/riscv-gnu-toolchain | 2 +- toolchain/riscv-isa-sim/disasm/disasm.cc | 4 ++++ toolchain/riscv-opcodes | 2 +- 6 files changed, 17 insertions(+), 3 deletions(-) diff --git a/hardware/deps/snitch/src/riscv_instr.sv b/hardware/deps/snitch/src/riscv_instr.sv index 23107aa70..afbd2cd7c 100644 --- a/hardware/deps/snitch/src/riscv_instr.sv +++ b/hardware/deps/snitch/src/riscv_instr.sv @@ -935,6 +935,8 @@ package riscv_instr; localparam logic [31:0] PV_PACK_H = 32'b1101001??????????000?????1010111; localparam logic [31:0] PV_PACKHI_B = 32'b1101100??????????001?????1010111; localparam logic [31:0] PV_PACKLO_B = 32'b1110000??????????001?????1010111; + localparam logic [31:0] Q_PUSH = 
32'b00111????????????010?????0101111; + localparam logic [31:0] Q_POP = 32'b00110??00000?????010?????0101111; /* CSR Addresses */ localparam logic [11:0] CSR_FFLAGS = 12'h1; localparam logic [11:0] CSR_FRM = 12'h2; diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index 5f038c2e4..07426d57f 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -10,6 +10,8 @@ `include "common_cells/registers.svh" +import cf_math_pkg::idx_width; + module tcdm_adapter_xqueue #( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, @@ -18,7 +20,7 @@ module tcdm_adapter_xqueue #( parameter bit RegisterAmo = 1'b0, // Cut path between request and response at the cost of increased AMO latency // Dependent parameters. DO NOT CHANGE. localparam int unsigned BeWidth = DataWidth/8, - localparam int unsigned QCntWidth = $clog2(XQueueSize) + localparam int unsigned QCntWidth = idx_width(XQueueSize) ) ( input logic clk_i, input logic rst_ni, diff --git a/software/runtime/encoding.h b/software/runtime/encoding.h index ce0ce72fa..ee518a9ea 100644 --- a/software/runtime/encoding.h +++ b/software/runtime/encoding.h @@ -2148,6 +2148,10 @@ #define MASK_PV_PACKHI_B 0xfe00707f #define MATCH_PV_PACKLO_B 0xe0001057 #define MASK_PV_PACKLO_B 0xfe00707f +#define MATCH_Q_PUSH 0x3800202f +#define MASK_Q_PUSH 0xf800707f +#define MATCH_Q_POP 0x3000202f +#define MASK_Q_POP 0xf9f0707f #define CSR_FFLAGS 0x1 #define CSR_FRM 0x2 #define CSR_FCSR 0x3 @@ -3379,6 +3383,8 @@ DECLARE_INSN(pv_pack, MATCH_PV_PACK, MASK_PV_PACK) DECLARE_INSN(pv_pack_h, MATCH_PV_PACK_H, MASK_PV_PACK_H) DECLARE_INSN(pv_packhi_b, MATCH_PV_PACKHI_B, MASK_PV_PACKHI_B) DECLARE_INSN(pv_packlo_b, MATCH_PV_PACKLO_B, MASK_PV_PACKLO_B) +DECLARE_INSN(q_push, MATCH_Q_PUSH, MASK_Q_PUSH) +DECLARE_INSN(q_pop, MATCH_Q_POP, MASK_Q_POP) #endif #ifdef DECLARE_CSR DECLARE_CSR(fflags, CSR_FFLAGS) diff --git a/toolchain/riscv-gnu-toolchain 
b/toolchain/riscv-gnu-toolchain index 70acebe25..3b3b3dcbc 160000 --- a/toolchain/riscv-gnu-toolchain +++ b/toolchain/riscv-gnu-toolchain @@ -1 +1 @@ -Subproject commit 70acebe256fc49114b5f068fa79f03eb9affed09 +Subproject commit 3b3b3dcbc2c759924d25833374f4402d817b4b9c diff --git a/toolchain/riscv-isa-sim/disasm/disasm.cc b/toolchain/riscv-isa-sim/disasm/disasm.cc index fbb889775..d3d92c4ac 100644 --- a/toolchain/riscv-isa-sim/disasm/disasm.cc +++ b/toolchain/riscv-isa-sim/disasm/disasm.cc @@ -1515,6 +1515,10 @@ disassembler_t::disassembler_t(int xlen) DEFINE_RTYPE(pv_shuffle2_h); DEFINE_RTYPE(pv_shuffle2_b); + // Xqueues extension + DEFINE_XAMO(q_push) + DEFINE_XAMO_LR(q_pop) + // provide a default disassembly for all instructions as a fallback #define DECLARE_INSN(code, match, mask) \ add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {})); diff --git a/toolchain/riscv-opcodes b/toolchain/riscv-opcodes index 6bda68aa8..00b89eb39 160000 --- a/toolchain/riscv-opcodes +++ b/toolchain/riscv-opcodes @@ -1 +1 @@ -Subproject commit 6bda68aa82b78b47a61cbf0c08e39cf83a03f152 +Subproject commit 00b89eb39dbe8a980dd1485732b78231d01217c3 From d7bad0a60bd0b1906cecd19c9937611b515fab77 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Tue, 18 May 2021 19:39:41 +0200 Subject: [PATCH 04/24] [snitch] Add xqueues extension to instruction decoder --- hardware/deps/snitch/src/snitch.sv | 43 ++++++++++++++++++++++++++++-- hardware/src/mempool_cc.sv | 9 ++++--- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv index e4d48bb18..fd2927834 100644 --- a/hardware/deps/snitch/src/snitch.sv +++ b/hardware/deps/snitch/src/snitch.sv @@ -18,7 +18,8 @@ module snitch parameter logic [31:0] MTVEC = BootAddr, // Exception Base Address (see privileged spec 3.1.7) parameter bit RVE = 0, // Reduced-register Extension parameter bit RVM = 1, // Enable IntegerMmultiplication & Division Extension - parameter 
int RegNrWritePorts = 2 // Implement one or two write ports into the register file + parameter int RegNrWritePorts = 2, // Implement one or two write ports into the register file + parameter bit Xqueue = 0 ) ( input logic clk_i, input logic rst_i, @@ -152,7 +153,10 @@ module snitch AMOMin = 4'h8, AMOMinu = 4'h9, AMOLR = 4'hA, - AMOSC = 4'hB + AMOSC = 4'hB, + // TODO(smazzola): parametrize + QPush = 4'hC, // Only used when Xqueue is enabled + QPop = 4'hD // Only used when Xqueue is enabled } ls_amo; logic [31:0] ld_result; @@ -1324,6 +1328,41 @@ module snitch end /* end of Xpulpimg extension */ +/* Xqueues extension */ + // TODO(khovg): Add define to include instr + riscv_instr::Q_PUSH: begin + if (Xqueue) begin + alu_op = BypassA; + write_rd = 1'b0; + uses_rd = 1'b1; + is_load = 1'b1; + is_signed = 1'b1; + ls_size = Word; + ls_amo = QPush; + opa_select = Reg; + opb_select = Reg; + end else begin + illegal_inst = 1'b1; + end + end + // TODO(khovg): Two source registers are unnnecessary + riscv_instr::Q_POP: begin + if (Xqueue) begin + alu_op = BypassA; + write_rd = 1'b0; + uses_rd = 1'b1; + is_load = 1'b1; + is_signed = 1'b1; + ls_size = Word; + ls_amo = QPop; + opa_select = Reg; + opb_select = Reg; + end else begin + illegal_inst = 1'b1; + end + end +/* end of Xqueues extension */ + // TODO(zarubaf): Illegal Instructions default: begin illegal_inst = 1'b1; diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv index 096156608..3c86b19d4 100644 --- a/hardware/src/mempool_cc.sv +++ b/hardware/src/mempool_cc.sv @@ -57,10 +57,11 @@ module mempool_cc // Snitch Integer Core snitch #( - .BootAddr ( BootAddr ), - .MTVEC ( MTVEC ), - .RVE ( RVE ), - .RVM ( RVM ) + .BootAddr ( BootAddr ), + .MTVEC ( MTVEC ), + .RVE ( RVE ), + .RVM ( RVM ), + .Xqueue ( mempool_pkg::Xqueue ) ) i_snitch ( .clk_i , .rst_i , From 1b6ea2b3b1fcfacdda7f46850f45c93692c0268c Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Wed, 19 May 2021 00:58:44 +0200 Subject: [PATCH 05/24] 
[hardware] Fix response acquisition --- hardware/src/tcdm_adapter_xqueue.sv | 39 ++++++++++++++++++----------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index 07426d57f..d89c7a9ee 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -83,6 +83,11 @@ module tcdm_adapter_xqueue #( logic resp_vld; logic sresp_vld; + // Helper signals to determine response data acquisition + logic mem_read_req; + logic force_rdata_acq; + logic prevent_rdata_acq; + // FSM related signals state_e state_q, state_d; logic vld_amo_op; @@ -167,6 +172,11 @@ module tcdm_adapter_xqueue #( assign resp_in_data = out_rdata_i; assign rdata_out_rdy = resp_accepted; + // Set if memory read request occurs this cycle + assign mem_read_req = out_req_o & !out_write_o; + // Acquire response data a cycle after a memory read request (can be forced or prevented) + assign rdata_in_vld_d = force_rdata_acq | (mem_read_req & !prevent_rdata_acq); + // Output response valid if both meta and read data are available (the read data will always be last) assign resp_vld = meta_out_vld & rdata_out_vld; assign sresp_vld = smeta_out_vld & rdata_out_vld; @@ -205,8 +215,9 @@ module tcdm_adapter_xqueue #( // Response data as feed-through of read data // resp_in_data = out_rdata_i; - // Response is acquired a cycle after a read access - rdata_in_vld_d = out_req_o & !out_write_o; + // Flags to force or prevent response acquisition + force_rdata_acq = 1'b0; + prevent_rdata_acq = 1'b0; // Flags to increment queue counters increment_tail = 1'b0; @@ -248,16 +259,16 @@ module tcdm_adapter_xqueue #( if (amo_op_t'(in_amo_i) == QPush) begin if (queue_full) begin // Set flag and store queue push data for later - queue_stalled_d = 1'b1; - qpush_data_d = in_wdata_i; // TODO: MIGHT NOT BE NEEDED + queue_stalled_d = 1'b1; + qpush_data_d = in_wdata_i; // TODO: MIGHT NOT BE NEEDED // Prevent acquisition of 
response data (TODO: might not be needed) - rdata_in_vld_d = 1'b0; + prevent_rdata_acq = 1'b1; end else begin // Set increment flag - increment_tail = 1'b1; + increment_tail = 1'b1; // Force acquisition of response data despite a write access // Response data will match the write data of the write access - rdata_in_vld_d = 1'b1; + force_rdata_acq = 1'b1; // Previous queue pop failed due to empty queue if (queue_stalled_q) begin queue_stalled_d = 1'b0; @@ -270,9 +281,9 @@ module tcdm_adapter_xqueue #( if (amo_op_t'(in_amo_i) == QPop) begin if (queue_empty) begin // Set flag - queue_stalled_d = 1'b1; + queue_stalled_d = 1'b1; // Prevent acquisition of response data despite read access - rdata_in_vld_d = 1'b0; + prevent_rdata_acq = 1'b1; end else begin // Set increment flag increment_head = 1'b1; @@ -318,16 +329,16 @@ module tcdm_adapter_xqueue #( // Wait until pop response accepted if (resp_accepted) begin // Set success flag - increment_tail = 1'b1; + increment_tail = 1'b1; // Trigger memory access - out_req_o = 1'b1; + out_req_o = 1'b1; // Force acquisition of response data despite a write access // Response data will match the write data of the write access - rdata_in_vld_d = 1'b1; + force_rdata_acq = 1'b1; // Set meta data selection to stalled meta data - sresp_select_d = 1'b1; + sresp_select_d = 1'b1; // Return to Idle - state_d = Idle; + state_d = Idle; end end From 6224f5f19ec15469d92d37cbe3377355bdda97da Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Wed, 19 May 2021 01:44:38 +0200 Subject: [PATCH 06/24] [hardware] Remove qpush data registers by abusing buffer slot --- hardware/src/tcdm_adapter_xqueue.sv | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index d89c7a9ee..cb454d368 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -117,9 +117,6 @@ module tcdm_adapter_xqueue #( logic 
increment_tail, increment_head; logic stalled_queue_op; - // Temporary storage of write data for stalled queue push - logic[DataWidth-1:0] qpush_data_d, qpush_data_q; - // Stores the metadata at handshake (except stalled queue operations) spill_register #( .T (metadata_t), @@ -200,7 +197,6 @@ module tcdm_adapter_xqueue #( state_d = state_q; sresp_select_d = sresp_select_q; queue_stalled_d = queue_stalled_q; - qpush_data_d = qpush_data_q; // While response is pending no requests are accepted in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1; @@ -258,10 +254,10 @@ module tcdm_adapter_xqueue #( // Queue push if (amo_op_t'(in_amo_i) == QPush) begin if (queue_full) begin - // Set flag and store queue push data for later + // Note: Memory write is still executed but the tail is not incremented + // Set stalled flag queue_stalled_d = 1'b1; - qpush_data_d = in_wdata_i; // TODO: MIGHT NOT BE NEEDED - // Prevent acquisition of response data (TODO: might not be needed) + // Prevent acquisition of response data prevent_rdata_acq = 1'b1; end else begin // Set increment flag @@ -280,7 +276,7 @@ module tcdm_adapter_xqueue #( // Queue pop if (amo_op_t'(in_amo_i) == QPop) begin if (queue_empty) begin - // Set flag + // Set stalled flag queue_stalled_d = 1'b1; // Prevent acquisition of response data despite read access prevent_rdata_acq = 1'b1; @@ -316,19 +312,18 @@ module tcdm_adapter_xqueue #( end // ResolveQPushStall State blocks any requests until queue pop response - // has been accepted and then executes the queue push + // has been accepted and then prepares the queue push response + // (queue push stores data even in full queue but does not update tail) ResolveQPushStall: begin // Do not accept any requests during resolve in_ready_o = 1'b0; - // Prepare queue push (write data at tail of queue) - // TODO: INSTEAD READ STORED DATA FOR PUSH RESPONSE + // Retrieve queue push data as dummy response (read data at tail of queue) out_add_o = curr_tail_q; - out_write_o = 1'b1; - 
out_wdata_o = qpush_data_q; + out_write_o = 1'b0; out_be_o = 4'b1111; // Wait until pop response accepted if (resp_accepted) begin - // Set success flag + // Set increment flag increment_tail = 1'b1; // Trigger memory access out_req_o = 1'b1; @@ -353,7 +348,7 @@ module tcdm_adapter_xqueue #( out_be_o = 4'b1111; // Wait until push response accepted if (resp_accepted) begin - // Set success flag + // Set increment flag increment_head = 1'b1; // Trigger memory access out_req_o = 1'b1; @@ -467,7 +462,6 @@ module tcdm_adapter_xqueue #( next_tail_q <= 1; curr_head_q <= 0; queue_stalled_q <= 1'b0; - qpush_data_q <= '0; end else begin state_q <= state_d; amo_op_q <= amo_op_d; @@ -479,7 +473,6 @@ module tcdm_adapter_xqueue #( next_tail_q <= next_tail_d; curr_head_q <= curr_head_d; queue_stalled_q <= queue_stalled_d; - qpush_data_q <= qpush_data_d; end end From 03d622e3a5928d8d77acdfe1c4c72cd68694f094 Mon Sep 17 00:00:00 2001 From: Samuel Riedel Date: Wed, 23 Mar 2022 22:54:06 +0100 Subject: [PATCH 07/24] [apps/hardware] Implement xqueue_test app --- hardware/src/tcdm_adapter_xqueue.sv | 6 ++ software/apps/systolic/xqueue_test/main.c | 112 ++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 software/apps/systolic/xqueue_test/main.c diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index cb454d368..4adb3f415 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -502,6 +502,12 @@ module tcdm_adapter_xqueue #( @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy)) else $fatal (1, "Trying to push new data although the i_rdata_register is not ready."); `endif + + `ifndef VERILATOR + stalled_queue : assert property( + @(posedge clk_i) disable iff (~rst_ni) (!(queue_stalled_q && smeta_in_vld))) + else $fatal (1, "Trying to stall a queue operation despite an already stalled queue."); + `endif // pragma translate_on endmodule diff --git 
a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c new file mode 100644 index 000000000..4cd39ca5c --- /dev/null +++ b/software/apps/systolic/xqueue_test/main.c @@ -0,0 +1,112 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Gua Hao Khov, ETH Zurich + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +int32_t *queue = 0; + +int32_t producer_check, consumer_check, dummy_check; + +// queue push +static inline int32_t queue_push(void *const queue, int32_t data) { + int32_t ret; + asm volatile ("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue)); + return ret; +} + +// queue pop +inline int32_t queue_pop(void *const queue) { + int32_t ret; + asm volatile ("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue)); + return ret; +} + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + extern int32_t __seq_start; + + // Initialize synchronization variables + mempool_barrier_init(core_id); + + // Initialization + mempool_init(core_id, num_cores); + + // Setup + if (core_id == 0) { + printf("Initialize\n"); + queue = &__seq_start; + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Producer + if (core_id == 0) { + int32_t data[16] = {1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16}; + int32_t check = 0; + int32_t resp; + int32_t dummy = 0; + for (uint32_t i = 0; i < 16; ++i) { + resp = queue_push(queue, data[i]); + dummy += resp; + } + for (uint32_t i = 0; i < 16; ++i) { + resp = queue_push(queue, data[i]); + dummy += resp; + check += data[i]; + } + producer_check = check; + dummy_check = dummy; + } + + // Consumer + if (core_id == 1) { + int32_t read_data; + int32_t check = 0; + for (uint32_t i = 0; i < 16; ++i) { + read_data = queue_pop(queue); + printf("Rx: %d\n", read_data); + } + printf("Burst Test\n"); + for (uint32_t i = 0; i < 16; ++i) { + read_data = queue_pop(queue); + check += read_data; + } + consumer_check = check; + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Print both checks + if (core_id == 0) { + printf("Check: %d/%d/%d\n", producer_check, consumer_check, dummy_check); + } + + // wait until all cores have finished + mempool_barrier(num_cores); + return 0; +} From 20248cbf627e5b1000cc1ef26f9b605a47ec2e82 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Wed, 19 May 2021 17:24:30 +0200 Subject: [PATCH 08/24] [apps] Implement systolic matmul_xqueue (1x1 matmul) --- software/apps/systolic/matmul_xqueue/main.c | 210 ++++++++++ software/runtime/systolic/matmul_xqueue.h | 401 ++++++++++++++++++++ 2 files changed, 611 insertions(+) create mode 100644 software/apps/systolic/matmul_xqueue/main.c create mode 100644 software/runtime/systolic/matmul_xqueue.h diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c new file mode 100644 index 000000000..6123c8764 --- /dev/null +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -0,0 +1,210 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Gua Hao Khov, ETH Zurich + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "systolic/matmul_xqueue.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +// Dimensions of matrices +#define DIM_M 12 +#define DIM_N 12 +#define DIM_P 12 + +uint32_t *grid_mapping; + +int32_t *matrix_A; +int32_t *matrix_B; + +uint32_t rep_count; + +systolic_matrix_t *syst_matrix_A; +systolic_matrix_t *syst_matrix_B; +systolic_matrix_t *syst_matrix_C; + +void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows, + uint32_t num_cols) { + int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4); + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + new_matrix[y * num_cols + x] = (int32_t)(y + x); + } + } + *matrix = new_matrix; +} + +void print_matrix(int32_t const *matrix, uint32_t num_rows, + uint32_t num_columns) { + printf("Matrix at 0x%8X\n", (uint32_t)matrix); + for (uint32_t i = 0; i < num_rows; ++i) { + for (uint32_t j = 0; j < num_columns; ++j) { + printf("%5d ", matrix[i * num_columns + j]); + } + printf("\n"); + } +} + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t tile_id = core_id / 4; + + // Initialize synchronization variables + mempool_barrier_init(core_id); + + // Initialization + mempool_init(core_id, num_cores); + + // Allocate systolic grid mapping + if (core_id == 0) { + grid_mapping = (uint32_t *)simple_malloc(num_cores * 4); + } + + // ---------- + // 16 
CORES + // ---------- + + // Assign grid position (row wise) + // uint32_t col_idx = core_id % 4; + // uint32_t row_idx = core_id / 4; + + // Assign grid position (col wise) + uint32_t col_idx = core_id / 4; + uint32_t row_idx = core_id % 4; + + // Assign grid position (tile wise) + // uint32_t col_idx; + // uint32_t row_idx; + // if (core_id < 4) { + // col_idx = core_id % 2; + // row_idx = core_id / 2; + // } else if (core_id < 8) { + // col_idx = core_id % 2 + 2; + // row_idx = core_id / 6; + // } else if (core_id < 12) { + // col_idx = core_id % 2; + // row_idx = core_id / 10 + 2; + // } else { + // col_idx = core_id % 2 + 2; + // row_idx = core_id / 14 + 2; + // } + + // uint32_t mapped_tile = tile_id; + + // ---------- + // 256 CORES + // ---------- + + // Assign grid position (col wise) + // uint32_t col_idx = core_id / 16; + // uint32_t row_idx = core_id % 16; + + // Assign grid position (tile wise) + // uint32_t mapped_group = core_id % 4; + // uint32_t col_idx = tile_id / 4; + // uint32_t row_idx = (tile_id % 4) + (mapped_group * 4); + // uint32_t mapped_tile = (tile_id % 16) + (mapped_group * 16); + + // Wait for all cores + mempool_barrier(num_cores); + + // Set systolic grid mapping + grid_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id; + + // Wait for all cores + mempool_barrier(num_cores); + + // Setup + if (core_id == 0) { + printf("> Initialize\n"); + + // Print out grid mapping + // print_matrix((int32_t *)grid_mapping, 4, 4); + + // Initialize systolic array + systolic_init(grid_mapping); + + // Create systolic matrices + generate_gradient_matrix(&matrix_A, DIM_M, DIM_N); + systolic_matrix_create(&syst_matrix_A, matrix_A, DIM_M, DIM_N); + simple_free(matrix_A); + generate_gradient_matrix(&matrix_B, DIM_N, DIM_P); + systolic_matrix_create(&syst_matrix_B, matrix_B, DIM_N, DIM_P); + simple_free(matrix_B); + systolic_matrix_allocate(&syst_matrix_C, DIM_M, DIM_P); + + // Print out systolic matrices A & B + // printf("> Print Systolic Matrices A 
& B\n"); + // systolic_matrix_print(syst_matrix_A); + // systolic_matrix_print(syst_matrix_B); + + // Set repetition count per submatrix of C (A->num_cols == B->num_rows) + rep_count = syst_matrix_A->num_cols; + } + + // Wait for all cores + mempool_barrier(num_cores); + + if (core_id == 0) { + // Start benchmark + printf("> Start\n"); + mempool_start_benchmark(); + } + + // Wait for all cores + mempool_barrier(num_cores); + + if ((row_idx == 0) && (col_idx == 0)) { + systolic_rcp_pe(rep_count, syst_matrix_A, syst_matrix_B, syst_matrix_C); + } + + if ((row_idx == 0) && (col_idx != 0)) { + systolic_cp_pe(col_idx, rep_count, syst_matrix_B, syst_matrix_C); + } + + if ((row_idx != 0) && (col_idx == 0)) { + systolic_rp_pe(row_idx, rep_count, syst_matrix_A, syst_matrix_C); + } + + if ((row_idx != 0) && (col_idx != 0)) { + systolic_np_pe(row_idx, col_idx, rep_count, syst_matrix_C); + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Print out benchmark + if (core_id == 0) { + // Stop benchmark + mempool_stop_benchmark(); + printf("> End\n"); + + // Print out systolic matrix C + // printf("> Print Systolic Matrix C\n"); + // systolic_matrix_print(syst_matrix_C); + } + + // wait until all cores have finished + mempool_barrier(num_cores); + return 0; +} diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h new file mode 100644 index 000000000..4b923db11 --- /dev/null +++ b/software/runtime/systolic/matmul_xqueue.h @@ -0,0 +1,401 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Gua Hao Khov, ETH Zurich + +/* This library implements a simple systolic architecture emulation + * using global code based orchestration + */ + +/* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix + * C = AB + * (max dimension is 16-bit) + */ + +#include "alloc.h" +#include "printf.h" + +// Dimensions of square systolic array +#define SYSTOLIC_SIZE 4 + +// Systolic matrix +typedef struct { + int32_t *matrix; + uint32_t num_rows; + uint32_t num_cols; +} systolic_matrix_t; + +// TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE + +// Array of queue ptrs in row-major order +int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; + +// TODO: GENERALIZE FOR ANY NUMBER OF TILES +void systolic_init(uint32_t const *grid_mapping) { + // Create systolic array via queues + extern int32_t __seq_start; + uint32_t grid_pos = 0; + uint32_t tile_id; + uint32_t tile_offset; + uint32_t bank_sel[4] = {0, 0, 0, 0}; + for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + tile_id = grid_mapping[grid_pos]; + tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; + queues_vert[y][x] = &__seq_start + tile_offset + bank_sel[tile_id]; + queues_horz[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1; + bank_sel[tile_id] += 2; + ++grid_pos; + } + } + // TODO: PRINT OUT THE ADDRESSES TO CHECK +} + +void systolic_matrix_allocate(systolic_matrix_t **syst_matrix, + uint32_t num_rows, uint32_t num_cols) { + // Allocate matrix array + int32_t *array = (int32_t 
*)simple_malloc(num_rows * num_cols * 4); + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = num_rows; + new_matrix->num_cols = num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix, + uint32_t num_rows, uint32_t num_cols) { + // Allocate matrix array + int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4); + + // Copy data into new matrix array + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + array[y * num_cols + x] = matrix[y * num_cols + x]; + } + } + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = num_rows; + new_matrix->num_cols = num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_print(systolic_matrix_t *syst_matrix) { + printf("Systolic matrix at 0x%08X\n", (uint32_t)syst_matrix); + uint32_t num_rows = syst_matrix->num_rows; + uint32_t num_cols = syst_matrix->num_cols; + int32_t *matrix = syst_matrix->matrix; + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + printf("%5d ", matrix[y * num_cols + x]); + } + printf("\n"); + } +} + +// row and column producing processing element +void systolic_rcp_pe(const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *q_next_horz; + int32_t *q_next_vert; + int32_t data_horz = 0; + int32_t data_vert = 0; + int32_t *matrix_A; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + int32_t curr_element_C; + + // Assign queues + q_next_horz = queues_horz[0][1]; 
+ q_next_vert = queues_vert[1][0]; + + // Get matrix arrays + matrix_A = A->matrix; + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + // Reset value + curr_element_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz = matrix_A[y * num_cols_A + i]; + data_vert = matrix_B[i * num_cols_B + x]; + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + curr_element_C += data_horz * data_vert; + } + + // Store value + matrix_C[y * num_cols_C + x] = curr_element_C; + } + } +} + +// column producing processing element +void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *q_prev_horz; + int32_t *q_next_horz; + int32_t *q_next_vert; + int32_t data_horz = 0; + int32_t data_vert = 0; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + int32_t curr_element_C; + + // Assign queues + q_prev_horz = queues_horz[0][col_idx]; + if (col_idx == SYSTOLIC_SIZE - 1) { + q_next_horz = NULL; + } else { + q_next_horz = queues_horz[0][col_idx + 1]; + } + q_next_vert = queues_vert[1][col_idx]; + + // Get matrix arrays + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + // Shift 
x + shifted_x = x + col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset value + curr_element_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_vert = matrix_B[i * num_cols_B + shifted_x]; + data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + if (q_next_horz) { + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + } + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + curr_element_C += data_horz * data_vert; + } + + // Store value + matrix_C[y * num_cols_C + shifted_x] = curr_element_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + if (q_next_horz) { + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + } + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + } + } + } + } +} + +// row producing processing element +void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ C) { + int32_t *q_next_horz; + int32_t *q_prev_vert; + int32_t *q_next_vert; + int32_t data_horz = 0; + int32_t data_vert = 0; + int32_t *matrix_A; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_y; + int32_t curr_element_C; + + // Assign queues + q_next_horz = queues_horz[row_idx][1]; + q_prev_vert = queues_vert[row_idx][0]; + if (row_idx == SYSTOLIC_SIZE - 1) { + q_next_vert = NULL; + } else { + q_next_vert = queues_vert[row_idx + 1][0]; + } + + // Get matrix arrays + matrix_A = A->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < 
num_cols_C; x += SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset value + curr_element_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz = matrix_A[shifted_y * num_cols_A + i]; + data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + if (q_next_vert) { + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + } + curr_element_C += data_horz * data_vert; + } + + // Store value + matrix_C[shifted_y * num_cols_C + x] = curr_element_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + if (q_next_vert) { + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + } + } + } + } + } +} + +// non-producing processing element +void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, + const uint32_t rep_count, + systolic_matrix_t const *__restrict__ C) { + int32_t *q_prev_horz; + int32_t *q_next_horz; + int32_t *q_prev_vert; + int32_t *q_next_vert; + int32_t data_horz = 0; + int32_t data_vert = 0; + int32_t *matrix_C; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + uint32_t shifted_y; + int32_t curr_element_C; + + // Assign queues + q_prev_horz = queues_horz[row_idx][col_idx]; + if (col_idx == SYSTOLIC_SIZE - 1) { + q_next_horz = NULL; + } else { + q_next_horz = queues_horz[row_idx][col_idx + 1]; + } + q_prev_vert = queues_vert[row_idx][col_idx]; + if (row_idx == SYSTOLIC_SIZE - 1) { + q_next_vert = NULL; + } else { + q_next_vert = queues_vert[row_idx + 1][col_idx]; + } + + // Get matrix arrays + matrix_C = C->matrix; + + // Get dimensions of matrices + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // 
Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + col_idx; + shifted_y = y + row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset value + curr_element_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + if (q_next_horz) { + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + } + if (q_next_vert) { + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + } + curr_element_C += data_horz * data_vert; + } + + // Store values + matrix_C[shifted_y * num_cols_C + shifted_x] = curr_element_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + if (q_next_horz) { + __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + + } + if (q_next_vert) { + __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + } + } + } + } + } +} From ac43b0c35300e0d279fff08fdc77604674732ceb Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Wed, 19 May 2021 17:24:51 +0200 Subject: [PATCH 09/24] [apps] Optimize systolic matmul_xqueue for 2x2 matmul --- software/apps/systolic/matmul_xqueue/main.c | 6 +- software/runtime/systolic/matmul_xqueue.h | 337 +++++++++++++++----- 2 files changed, 260 insertions(+), 83 deletions(-) diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index 6123c8764..99ac34f80 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -159,7 +159,7 @@ int main() { 
// systolic_matrix_print(syst_matrix_B); // Set repetition count per submatrix of C (A->num_cols == B->num_rows) - rep_count = syst_matrix_A->num_cols; + rep_count = syst_matrix_A->num_cols / 2; } // Wait for all cores @@ -200,8 +200,8 @@ int main() { printf("> End\n"); // Print out systolic matrix C - // printf("> Print Systolic Matrix C\n"); - // systolic_matrix_print(syst_matrix_C); + printf("> Print Systolic Matrix C\n"); + systolic_matrix_print(syst_matrix_C); } // wait until all cores have finished diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index 4b923db11..01ddfc9fa 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -67,29 +67,49 @@ void systolic_init(uint32_t const *grid_mapping) { void systolic_matrix_allocate(systolic_matrix_t **syst_matrix, uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + // Allocate matrix array - int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4); + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); // Allocate systolic matrix systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); // Assign values to systolic matrix new_matrix->matrix = array; - new_matrix->num_rows = num_rows; - new_matrix->num_cols = num_cols; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; *syst_matrix = new_matrix; } void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix, uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + // Allocate matrix array - int32_t *array = (int32_t 
*)simple_malloc(num_rows * num_cols * 4); + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); // Copy data into new matrix array for (uint32_t y = 0; y < num_rows; ++y) { for (uint32_t x = 0; x < num_cols; ++x) { - array[y * num_cols + x] = matrix[y * num_cols + x]; + array[y * syst_num_cols + x] = matrix[y * num_cols + x]; + } + } + + // Zero padding of matrix array + if (syst_num_cols != num_cols) { + for (uint32_t y = 0; y < syst_num_rows; ++y) { + array[y * syst_num_cols + syst_num_cols - 1] = 0; + } + } + if (syst_num_rows != num_rows) { + for (uint32_t x = 0; x < syst_num_cols; ++x) { + array[(syst_num_rows - 1) * syst_num_cols + x] = 0; } } @@ -98,8 +118,8 @@ void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix, // Assign values to systolic matrix new_matrix->matrix = array; - new_matrix->num_rows = num_rows; - new_matrix->num_cols = num_cols; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; *syst_matrix = new_matrix; } @@ -124,8 +144,8 @@ void systolic_rcp_pe(const uint32_t rep_count, systolic_matrix_t const *__restrict__ C) { int32_t *q_next_horz; int32_t *q_next_vert; - int32_t data_horz = 0; - int32_t data_vert = 0; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; int32_t *matrix_A; int32_t *matrix_B; int32_t *matrix_C; @@ -133,7 +153,12 @@ void systolic_rcp_pe(const uint32_t rep_count, uint32_t num_cols_B; uint32_t num_rows_C; uint32_t num_cols_C; - int32_t curr_element_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; // Assign queues q_next_horz = queues_horz[0][1]; @@ -151,22 +176,49 @@ void systolic_rcp_pe(const uint32_t rep_count, num_cols_C = C->num_cols; // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { - // 
Reset value - curr_element_C = 0; + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < rep_count; ++i) { - data_horz = matrix_A[y * num_cols_A + i]; - data_vert = matrix_B[i * num_cols_B + x]; - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); - curr_element_C += data_horz * data_vert; + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[y * num_cols_A + i]; + data_horz[1] = matrix_A[y * num_cols_A + i + 1]; + data_horz[2] = matrix_A[(y + 1) * num_cols_A + i]; + data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; + data_vert[0] = matrix_B[i * num_cols_B + x]; + data_vert[1] = matrix_B[i * num_cols_B + x + 1]; + data_vert[2] = matrix_B[(i + 1) * num_cols_B + x]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + curr_element_0_C += data_horz[1] * data_vert[2]; + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[2] * data_vert[0]; + 
curr_element_3_C += data_horz[2] * data_vert[1]; } - // Store value - matrix_C[y * num_cols_C + x] = curr_element_C; + // Store values + anchor_row_0 = y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; } } } @@ -178,15 +230,20 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, int32_t *q_prev_horz; int32_t *q_next_horz; int32_t *q_next_vert; - int32_t data_horz = 0; - int32_t data_vert = 0; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; int32_t *matrix_B; int32_t *matrix_C; uint32_t num_cols_B; uint32_t num_rows_C; uint32_t num_cols_C; uint32_t shifted_x; - int32_t curr_element_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; // Assign queues q_prev_horz = queues_horz[0][col_idx]; @@ -207,37 +264,73 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, num_cols_C = C->num_cols; // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { // Shift x - shifted_x = x + col_idx; + shifted_x = x + 2 * col_idx; // Check if this PE is currently within the matrix C if (shifted_x < num_cols_C) { - // Reset value - curr_element_C = 0; + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < rep_count; ++i) { - data_vert = matrix_B[i * num_cols_B + shifted_x]; - data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + for 
(uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); } - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); - curr_element_C += data_horz * data_vert; + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + curr_element_0_C += data_horz[1] * data_vert[2]; + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[1]; } - // Store value - matrix_C[y * num_cols_C + shifted_x] = curr_element_C; + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + 
matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); } - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } } } @@ -251,15 +344,20 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, int32_t *q_next_horz; int32_t *q_prev_vert; int32_t *q_next_vert; - int32_t data_horz = 0; - int32_t data_vert = 0; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; int32_t *matrix_A; int32_t *matrix_C; uint32_t num_cols_A; uint32_t num_rows_C; uint32_t num_cols_C; uint32_t shifted_y; - int32_t curr_element_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; // Assign queues q_next_horz = queues_horz[row_idx][1]; @@ -280,36 +378,72 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, num_cols_C = C->num_cols; // Execute step-wise 
matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { // Shift y - shifted_y = y + row_idx; + shifted_y = y + 2 * row_idx; // Check if this PE is currently within the matrix C if (shifted_y < num_rows_C) { - // Reset value - curr_element_C = 0; + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < rep_count; ++i) { - data_horz = matrix_A[shifted_y * num_cols_A + i]; - data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; + data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + 
__atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } - curr_element_C += data_horz * data_vert; + curr_element_0_C += data_horz[1] * data_vert[2]; + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[1]; } - // Store value - matrix_C[shifted_y * num_cols_C + x] = curr_element_C; + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } 
} } @@ -325,14 +459,19 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, int32_t *q_next_horz; int32_t *q_prev_vert; int32_t *q_next_vert; - int32_t data_horz = 0; - int32_t data_vert = 0; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; int32_t *matrix_C; uint32_t num_rows_C; uint32_t num_cols_C; uint32_t shifted_x; uint32_t shifted_y; - int32_t curr_element_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; // Assign queues q_prev_horz = queues_horz[row_idx][col_idx]; @@ -356,43 +495,81 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, num_cols_C = C->num_cols; // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) { + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { // Shift x and y - shifted_x = x + col_idx; - shifted_y = y + row_idx; + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; // Check if this PE is currently within the matrix C if (shifted_x < num_cols_C && shifted_y < num_rows_C) { - // Reset value - curr_element_C = 0; + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, 
__ATOMIC_SEQ_CST); + data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); } if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } - curr_element_C += data_horz * data_vert; + curr_element_0_C += data_horz[1] * data_vert[2]; + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[1]; } // Store values - matrix_C[shifted_y * num_cols_C + shifted_x] = curr_element_C; + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_horz[0] = 
__atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); + data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); + data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST); - + __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); } if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); + __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } } } From 51b6eb5b9609d5f3c519e32fba40d9e67ffc3693 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Thu, 20 May 2021 20:39:32 +0200 Subject: [PATCH 10/24] [apps] Improve matmul_xqueue code - add interleaving and fix data dependency - hotfix to enforce data dependency in dummy pop/push --- software/apps/systolic/matmul_xqueue/main.c | 4 +- software/runtime/systolic/matmul_xqueue.h | 845 ++++++++++++++------ 2 files changed, 603 insertions(+), 246 deletions(-) diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index 99ac34f80..da46fbe35 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -200,8 +200,8 @@ int main() { printf("> 
End\n"); // Print out systolic matrix C - printf("> Print Systolic Matrix C\n"); - systolic_matrix_print(syst_matrix_C); + //printf("> Print Systolic Matrix C\n"); + //systolic_matrix_print(syst_matrix_C); } // wait until all cores have finished diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index 01ddfc9fa..113091293 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -23,6 +23,15 @@ /* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix * C = AB * (max dimension is 16-bit) + * Matrix is processed in 2x2 submatrices with the following indexing + * + * B B 0 2 + * B B 1 3 + * + * A A C C = 0 1 0 1 + * A A C C 2 3 2 3 + * + * e.g. C0 = A1 * B1 + A0 * B0 */ #include "alloc.h" @@ -44,6 +53,16 @@ typedef struct { int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +// queue push +inline int32_t queue_push(int32_t *queue, int32_t data) { + return __atomic_fetch_and(queue, data, __ATOMIC_RELAXED); +} + +// queue pop +inline int32_t queue_pop(int32_t *queue) { + return __atomic_fetch_or(queue, 0, __ATOMIC_RELAXED); +} + // TODO: GENERALIZE FOR ANY NUMBER OF TILES void systolic_init(uint32_t const *grid_mapping) { // Create systolic array via queues @@ -142,10 +161,12 @@ void systolic_rcp_pe(const uint32_t rep_count, systolic_matrix_t const *__restrict__ A, systolic_matrix_t const *__restrict__ B, systolic_matrix_t const *__restrict__ C) { - int32_t *q_next_horz; - int32_t *q_next_vert; + int32_t *queue_next_horz; + int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz __attribute__((unused)); + int32_t resp_vert __attribute__((unused)); int32_t *matrix_A; int32_t *matrix_B; int32_t *matrix_C; @@ -161,8 +182,8 @@ void systolic_rcp_pe(const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - q_next_horz = queues_horz[0][1]; - 
q_next_vert = queues_vert[1][0]; + queue_next_horz = queues_horz[0][1]; + queue_next_vert = queues_vert[1][0]; // Get matrix arrays matrix_A = A->matrix; @@ -187,29 +208,29 @@ void systolic_rcp_pe(const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[y * num_cols_A + i]; + data_vert[0] = matrix_B[i * num_cols_B + x]; + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; data_horz[1] = matrix_A[y * num_cols_A + i + 1]; + data_vert[1] = matrix_B[(i + 1) * num_cols_B + x]; + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; data_horz[2] = matrix_A[(y + 1) * num_cols_A + i]; + data_vert[2] = matrix_B[i * num_cols_B + x + 1]; + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; - data_vert[0] = matrix_B[i * num_cols_B + x]; - data_vert[1] = matrix_B[i * num_cols_B + x + 1]; - data_vert[2] = matrix_B[(i + 1) * num_cols_B + x]; data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], 
__ATOMIC_SEQ_CST); - curr_element_0_C += data_horz[1] * data_vert[2]; + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; - curr_element_0_C += data_horz[0] * data_vert[0]; - curr_element_1_C += data_horz[0] * data_vert[1]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[1]; } // Store values @@ -227,11 +248,13 @@ void systolic_rcp_pe(const uint32_t rep_count, void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ B, systolic_matrix_t const *__restrict__ C) { - int32_t *q_prev_horz; - int32_t *q_next_horz; - int32_t *q_next_vert; + int32_t *queue_prev_horz; + int32_t *queue_next_horz; + int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz __attribute__((unused)); + int32_t resp_vert __attribute__((unused)); int32_t *matrix_B; int32_t *matrix_C; uint32_t num_cols_B; @@ -246,13 +269,13 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - q_prev_horz = queues_horz[0][col_idx]; + queue_prev_horz = queues_horz[0][col_idx]; if (col_idx == SYSTOLIC_SIZE - 1) { - q_next_horz = NULL; + queue_next_horz = NULL; } else { - q_next_horz = queues_horz[0][col_idx + 1]; + queue_next_horz = queues_horz[0][col_idx + 1]; } - q_next_vert = queues_vert[1][col_idx]; + queue_next_vert = queues_vert[1][col_idx]; // Get matrix arrays matrix_B = B->matrix; @@ -263,74 +286,134 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, num_rows_C = C->num_rows; num_cols_C = C->num_cols; - // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { 
- for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { - // Shift x - shifted_x = x + 2 * col_idx; - - // Check if this PE is currently within the matrix C - if (shifted_x < num_cols_C) { - // Reset values - curr_element_0_C = 0; - curr_element_1_C = 0; - curr_element_2_C = 0; - curr_element_3_C = 0; - - // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < 2 * rep_count; i += 2) { - data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; - data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; - data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; - data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; - data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); + // Check if PE is at the right boundary + if (queue_next_horz) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + data_horz[0] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = 
queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + data_horz[1] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; + data_horz[2] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + data_horz[3] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + data_horz[1] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + data_horz[2] = queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + data_horz[3] = 
queue_pop(queue_prev_horz); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); } - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); - curr_element_0_C += data_horz[1] * data_vert[2]; - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[2]; - curr_element_3_C += data_horz[3] * data_vert[3]; - curr_element_0_C += data_horz[0] * data_vert[0]; - curr_element_1_C += data_horz[0] * data_vert[1]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[1]; } + } + } + } else { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + data_horz[0] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + data_horz[1] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; + data_horz[2] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; 
+ curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + data_horz[3] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } - // Store values - anchor_row_0 = y * num_cols_C + shifted_x; - anchor_row_1 = anchor_row_0 + num_cols_C; - matrix_C[anchor_row_0] = curr_element_0_C; - matrix_C[anchor_row_0 + 1] = curr_element_1_C; - matrix_C[anchor_row_1] = curr_element_2_C; - matrix_C[anchor_row_1 + 1] = curr_element_3_C; - } else { - // Pop and push dummy data - for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_horz[0]); + data_horz[1] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_horz[1]); + data_horz[2] = 
queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_horz[2]); + data_horz[3] = queue_pop(queue_prev_horz); + resp_vert = queue_push(queue_next_vert, data_horz[3]); } - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); } } } @@ -341,11 +424,13 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ A, systolic_matrix_t const *__restrict__ C) { - int32_t *q_next_horz; - int32_t *q_prev_vert; - int32_t *q_next_vert; + int32_t *queue_next_horz; + int32_t *queue_prev_vert; + int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz __attribute__((unused)); + int32_t resp_vert __attribute__((unused)); int32_t *matrix_A; int32_t *matrix_C; uint32_t num_cols_A; @@ -360,12 +445,12 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - q_next_horz = queues_horz[row_idx][1]; - q_prev_vert = queues_vert[row_idx][0]; + queue_next_horz = queues_horz[row_idx][1]; + queue_prev_vert = queues_vert[row_idx][0]; if (row_idx == SYSTOLIC_SIZE - 1) { - q_next_vert = NULL; + queue_next_vert = NULL; } else { - q_next_vert = queues_vert[row_idx + 1][0]; + queue_next_vert = queues_vert[row_idx + 1][0]; } // Get matrix arrays @@ -377,73 +462,133 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, num_rows_C = C->num_rows; num_cols_C = C->num_cols; - // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { - // Shift y - shifted_y = y + 2 * row_idx; - - // Check if this PE is 
currently within the matrix C - if (shifted_y < num_rows_C) { - // Reset values - curr_element_0_C = 0; - curr_element_1_C = 0; - curr_element_2_C = 0; - curr_element_3_C = 0; - - // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < 2 * rep_count; i += 2) { - data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; - data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; - data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; - data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; - data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); - if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + // Check if PE is at the bottom boundary + if (queue_next_vert) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + 
i]; + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = 
queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); } - curr_element_0_C += data_horz[1] * data_vert[2]; - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[2]; - curr_element_3_C += data_horz[3] * data_vert[3]; - curr_element_0_C += data_horz[0] * data_vert[0]; - curr_element_1_C += data_horz[0] * data_vert[1]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[1]; } + } + } + } else { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = matrix_A[(shifted_y + 1) * 
num_cols_A + i + 1]; + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } - // Store values - anchor_row_0 = shifted_y * num_cols_C + x; - anchor_row_1 = anchor_row_0 + num_cols_C; - matrix_C[anchor_row_0] = curr_element_0_C; - matrix_C[anchor_row_0 + 1] = curr_element_1_C; - matrix_C[anchor_row_1] = curr_element_2_C; - matrix_C[anchor_row_1 + 1] = curr_element_3_C; - } else { - // Pop and push dummy data - for (uint32_t i = 0; i < rep_count; ++i) { - data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); - if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_vert[0]); + data_vert[1] = 
queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_vert[1]); + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_vert[2]); + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_vert[3]); } } } @@ -455,12 +600,15 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ C) { - int32_t *q_prev_horz; - int32_t *q_next_horz; - int32_t *q_prev_vert; - int32_t *q_next_vert; + int32_t *queue_prev_horz; + int32_t *queue_next_horz; + int32_t *queue_prev_vert; + int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t data_dummy __attribute__((unused)) = 0; + int32_t resp_horz __attribute__((unused)); + int32_t resp_vert __attribute__((unused)); int32_t *matrix_C; uint32_t num_rows_C; uint32_t num_cols_C; @@ -474,17 +622,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, uint32_t anchor_row_1; // Assign queues - q_prev_horz = queues_horz[row_idx][col_idx]; + queue_prev_horz = queues_horz[row_idx][col_idx]; if (col_idx == SYSTOLIC_SIZE - 1) { - q_next_horz = NULL; + queue_next_horz = NULL; } else { - q_next_horz = queues_horz[row_idx][col_idx + 1]; + queue_next_horz = queues_horz[row_idx][col_idx + 1]; } - q_prev_vert = queues_vert[row_idx][col_idx]; + queue_prev_vert = queues_vert[row_idx][col_idx]; if (row_idx == SYSTOLIC_SIZE - 1) { - q_next_vert = NULL; + queue_next_vert = NULL; } else { - q_next_vert = queues_vert[row_idx + 1][col_idx]; + queue_next_vert = queues_vert[row_idx + 1][col_idx]; } // Get matrix arrays @@ -494,82 +642,291 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, num_rows_C = C->num_rows; num_cols_C = C->num_cols; - // Execute step-wise matrix multiplication - for (uint32_t y = 0; y < num_rows_C; y += 2 * 
SYSTOLIC_SIZE) { - for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { - // Shift x and y - shifted_x = x + 2 * col_idx; - shifted_y = y + 2 * row_idx; - - // Check if this PE is currently within the matrix C - if (shifted_x < num_cols_C && shifted_y < num_rows_C) { - // Reset values - curr_element_0_C = 0; - curr_element_1_C = 0; - curr_element_2_C = 0; - curr_element_3_C = 0; - - // Systolic matrix multiplication through MACs - for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); + // PE is not at a boundary + if (queue_next_horz && queue_next_vert) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + 
data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + resp_vert = 
queue_push(queue_next_vert, data_vert[1]); + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + } + } + } + } + } + + // PE is at the right boundary + if (!queue_next_horz && queue_next_vert) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + resp_vert = queue_push(queue_next_vert, data_vert[0]); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + resp_vert = queue_push(queue_next_vert, data_vert[3]); + curr_element_1_C += 
data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; } - if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + data_vert[0] += data_horz[0]; + resp_vert = queue_push(queue_next_vert, data_vert[0]); + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + data_vert[1] += data_horz[1]; + resp_vert = queue_push(queue_next_vert, data_vert[1]); + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + data_vert[2] += data_horz[2]; + resp_vert = queue_push(queue_next_vert, data_vert[2]); + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + data_vert[3] += data_horz[3]; + resp_vert = queue_push(queue_next_vert, data_vert[3]); } - curr_element_0_C += data_horz[1] * data_vert[2]; - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[2]; - curr_element_3_C += data_horz[3] * data_vert[3]; - curr_element_0_C += data_horz[0] * data_vert[0]; - curr_element_1_C += data_horz[0] * data_vert[1]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[1]; } + } + } + } - // Store values - anchor_row_0 = shifted_y * 
num_cols_C + shifted_x; - anchor_row_1 = anchor_row_0 + num_cols_C; - matrix_C[anchor_row_0] = curr_element_0_C; - matrix_C[anchor_row_0 + 1] = curr_element_1_C; - matrix_C[anchor_row_1] = curr_element_2_C; - matrix_C[anchor_row_1 + 1] = curr_element_3_C; - } else { - // Pop and push dummy data - for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST); - data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST); - if (q_next_horz) { - __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST); + // PE is at the bottom boundary + if (queue_next_horz && !queue_next_vert) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[0]); + 
curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[2]); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + resp_horz = queue_push(queue_next_horz, data_horz[3]); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; } - if (q_next_vert) { - __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST); - __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST); + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + data_horz[0] += data_vert[0]; + resp_horz = queue_push(queue_next_horz, data_horz[0]); + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + data_horz[1] += data_vert[1]; + resp_horz = queue_push(queue_next_horz, data_horz[1]); + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + data_horz[2] += 
data_vert[2]; + resp_horz = queue_push(queue_next_horz, data_horz[2]); + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + data_horz[3] += data_vert[3]; + resp_horz = queue_push(queue_next_horz, data_horz[3]); + } + } + } + } + } + + // PE is at the bottom right corner + if (!queue_next_horz && !queue_next_vert) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + curr_element_0_C += data_horz[0] * data_vert[0]; + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + curr_element_0_C += data_horz[1] * data_vert[1]; + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + curr_element_1_C += data_horz[0] * data_vert[2]; + curr_element_2_C += data_horz[2] * data_vert[0]; + curr_element_3_C += data_horz[2] * data_vert[2]; + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + curr_element_1_C += data_horz[1] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = 
curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + data_horz[0] = queue_pop(queue_prev_horz); + data_vert[0] = queue_pop(queue_prev_vert); + data_dummy += data_horz[0] * data_vert[0]; + data_horz[1] = queue_pop(queue_prev_horz); + data_vert[1] = queue_pop(queue_prev_vert); + data_dummy += data_horz[1] * data_vert[1]; + data_horz[2] = queue_pop(queue_prev_horz); + data_vert[2] = queue_pop(queue_prev_vert); + data_dummy += data_horz[2] * data_vert[2]; + data_horz[3] = queue_pop(queue_prev_horz); + data_vert[3] = queue_pop(queue_prev_vert); + data_dummy += data_horz[3] * data_vert[3]; + // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY + if (!data_dummy) + break; } } } From 5f2d0bcaa54129a93a87087b7e553b3b70d5f82f Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Thu, 27 May 2021 20:20:52 +0200 Subject: [PATCH 11/24] [apps] Optimize matmul_xqueue with asm inline --- software/apps/systolic/matmul_xqueue/main.c | 18 +- software/runtime/systolic/matmul_xqueue.h | 442 ++++++++++---------- 2 files changed, 242 insertions(+), 218 deletions(-) diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index da46fbe35..fafc4fbea 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -27,9 +27,9 @@ #include "synchronization.h" // Dimensions of matrices -#define DIM_M 12 -#define DIM_N 12 -#define DIM_P 12 +#define DIM_M 16 +#define DIM_N 16 +#define DIM_P 16 uint32_t *grid_mapping; @@ -171,6 +171,10 @@ int main() { mempool_start_benchmark(); } + // Start benchmark for all cores + // mempool_barrier(num_cores); + // mempool_start_benchmark(); + // Wait for all cores mempool_barrier(num_cores); @@ -193,6 +197,10 @@ int main() { // Wait for all cores mempool_barrier(num_cores); + // Stop benchmark for all cores + // mempool_stop_benchmark(); + // mempool_barrier(num_cores); + // Print out benchmark if (core_id == 0) { // Stop 
benchmark @@ -200,8 +208,8 @@ int main() { printf("> End\n"); // Print out systolic matrix C - //printf("> Print Systolic Matrix C\n"); - //systolic_matrix_print(syst_matrix_C); + // printf("> Print Systolic Matrix C\n"); + // systolic_matrix_print(syst_matrix_C); } // wait until all cores have finished diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index 113091293..2ba8f317d 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -54,13 +54,14 @@ int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; // queue push -inline int32_t queue_push(int32_t *queue, int32_t data) { - return __atomic_fetch_and(queue, data, __ATOMIC_RELAXED); +static inline void queue_push(void *const queue, int32_t data, + int32_t *const ret) { + asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); } // queue pop -inline int32_t queue_pop(int32_t *queue) { - return __atomic_fetch_or(queue, 0, __ATOMIC_RELAXED); +inline void queue_pop(void *const queue, int32_t *const ret) { + asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); } // TODO: GENERALIZE FOR ANY NUMBER OF TILES @@ -81,7 +82,22 @@ void systolic_init(uint32_t const *grid_mapping) { ++grid_pos; } } - // TODO: PRINT OUT THE ADDRESSES TO CHECK + + // Print out queue addresses + // printf("queues_vert\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_vert[y][x]); + // } + // printf("\n"); + // } + // printf("queues_horz\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_horz[y][x]); + // } + // printf("\n"); + // } } void systolic_matrix_allocate(systolic_matrix_t **syst_matrix, @@ -165,8 +181,8 @@ void systolic_rcp_pe(const uint32_t rep_count, int32_t *queue_next_vert; int32_t 
data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)); - int32_t resp_vert __attribute__((unused)); + int32_t resp_horz __attribute__((unused)) = 0; + int32_t resp_vert __attribute__((unused)) = 0; int32_t *matrix_A; int32_t *matrix_B; int32_t *matrix_C; @@ -209,25 +225,25 @@ void systolic_rcp_pe(const uint32_t rep_count, for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[y * num_cols_A + i]; data_vert[0] = matrix_B[i * num_cols_B + x]; - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - curr_element_0_C += data_horz[0] * data_vert[0]; + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); data_horz[1] = matrix_A[y * num_cols_A + i + 1]; data_vert[1] = matrix_B[(i + 1) * num_cols_B + x]; - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; + curr_element_0_C += data_horz[0] * data_vert[0]; + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); data_horz[2] = matrix_A[(y + 1) * num_cols_A + i]; data_vert[2] = matrix_B[i * num_cols_B + x + 1]; - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); + curr_element_0_C += data_horz[1] * data_vert[1]; + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); + data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; - data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; - resp_horz = 
queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -253,8 +269,8 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)); - int32_t resp_vert __attribute__((unused)); + int32_t resp_horz __attribute__((unused)) = 0; + int32_t resp_vert __attribute__((unused)) = 0; int32_t *matrix_B; int32_t *matrix_C; uint32_t num_cols_B; @@ -305,26 +321,26 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; - data_horz[0] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_horz, &data_horz[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; - data_horz[1] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; + curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_horz, &data_horz[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; - data_horz[2] = queue_pop(queue_prev_horz); - 
resp_horz = queue_push(queue_next_horz, data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_0_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz, &data_horz[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; - data_horz[3] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -340,18 +356,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - data_horz[1] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - data_horz[2] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); - data_horz[3] = queue_pop(queue_prev_horz); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + 
queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); } } } @@ -374,22 +390,22 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; - data_horz[0] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_horz, &data_horz[0]); + queue_push(queue_next_vert, data_vert[0], &resp_vert); data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; - data_horz[1] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; + curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_horz, &data_horz[1]); + queue_push(queue_next_vert, data_vert[1], &resp_vert); data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; - data_horz[2] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_0_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz, &data_horz[2]); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; - 
data_horz[3] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -405,14 +421,14 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_horz[0]); - data_horz[1] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_horz[1]); - data_horz[2] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_horz[2]); - data_horz[3] = queue_pop(queue_prev_horz); - resp_vert = queue_push(queue_next_vert, data_horz[3]); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_push(queue_next_vert, data_horz[0], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_push(queue_next_vert, data_horz[1], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_push(queue_next_vert, data_horz[2], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_push(queue_next_vert, data_horz[3], &resp_vert); } } } @@ -429,8 +445,8 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, int32_t *queue_next_vert; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)); - int32_t resp_vert __attribute__((unused)); + int32_t resp_horz __attribute__((unused)) = 0; + int32_t resp_vert __attribute__((unused)) = 0; int32_t *matrix_A; int32_t *matrix_C; uint32_t num_cols_A; @@ -481,26 +497,26 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = 
matrix_A[shifted_y * num_cols_A + i]; - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; + curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); + curr_element_0_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * 
data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -516,18 +532,18 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); } } } @@ -550,22 +566,22 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], 
&resp_horz); data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; + curr_element_0_C += data_horz[0] * data_vert[0]; + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[2]); + curr_element_0_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -581,14 +597,14 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_vert[0]); - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_vert[1]); - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_vert[2]); - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_vert[3]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_vert[0], &resp_horz); 
+ queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_vert[1], &resp_horz); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_vert[2], &resp_horz); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_vert[3], &resp_horz); } } } @@ -607,8 +623,8 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; int32_t data_dummy __attribute__((unused)) = 0; - int32_t resp_horz __attribute__((unused)); - int32_t resp_vert __attribute__((unused)); + int32_t resp_horz __attribute__((unused)) = 0; + int32_t resp_vert __attribute__((unused)) = 0; int32_t *matrix_C; uint32_t num_rows_C; uint32_t num_cols_C; @@ -661,27 +677,27 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); curr_element_0_C += data_horz[0] * data_vert[0]; - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); curr_element_0_C += data_horz[1] * data_vert[1]; - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, 
data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -697,22 +713,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); - resp_vert = queue_push(queue_next_vert, data_vert[0]); - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); - resp_vert = queue_push(queue_next_vert, data_vert[1]); - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[2]); - resp_vert = queue_push(queue_next_vert, data_vert[2]); - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, 
&data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_push(queue_next_vert, data_vert[1], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_vert, data_vert[3], &resp_vert); } } } @@ -738,23 +754,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); - resp_vert = queue_push(queue_next_vert, data_vert[0]); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_vert, data_vert[0], &resp_vert); curr_element_0_C += data_horz[0] * data_vert[0]; - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); - resp_vert = queue_push(queue_next_vert, data_vert[1]); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_vert, data_vert[1], &resp_vert); curr_element_0_C += data_horz[1] * data_vert[1]; - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); - resp_vert = queue_push(queue_next_vert, data_vert[2]); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_vert, data_vert[2], &resp_vert); curr_element_1_C += data_horz[0] * 
data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_vert, data_vert[3], &resp_vert); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -770,22 +786,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); data_vert[0] += data_horz[0]; - resp_vert = queue_push(queue_next_vert, data_vert[0]); - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); + queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); data_vert[1] += data_horz[1]; - resp_vert = queue_push(queue_next_vert, data_vert[1]); - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); + queue_push(queue_next_vert, data_vert[1], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); data_vert[2] += data_horz[2]; - resp_vert = queue_push(queue_next_vert, data_vert[2]); - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); + queue_push(queue_next_vert, data_vert[2], &resp_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); data_vert[3] += data_horz[3]; - resp_vert = queue_push(queue_next_vert, data_vert[3]); + queue_push(queue_next_vert, 
data_vert[3], &resp_vert); } } } @@ -811,23 +827,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[0]); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); + queue_push(queue_next_horz, data_horz[0], &resp_horz); curr_element_0_C += data_horz[0] * data_vert[0]; - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[1]); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); + queue_push(queue_next_horz, data_horz[1], &resp_horz); curr_element_0_C += data_horz[1] * data_vert[1]; - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[2]); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz, data_horz[2], &resp_horz); curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); - resp_horz = queue_push(queue_next_horz, data_horz[3]); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -843,22 +859,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - 
data_vert[0] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); data_horz[0] += data_vert[0]; - resp_horz = queue_push(queue_next_horz, data_horz[0]); - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); + queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); data_horz[1] += data_vert[1]; - resp_horz = queue_push(queue_next_horz, data_horz[1]); - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); + queue_push(queue_next_horz, data_horz[1], &resp_horz); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); data_horz[2] += data_vert[2]; - resp_horz = queue_push(queue_next_horz, data_horz[2]); - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); + queue_push(queue_next_horz, data_horz[2], &resp_horz); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); data_horz[3] += data_vert[3]; - resp_horz = queue_push(queue_next_horz, data_horz[3]); + queue_push(queue_next_horz, data_horz[3], &resp_horz); } } } @@ -884,19 +900,19 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); curr_element_0_C += data_horz[0] * data_vert[0]; - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); curr_element_0_C += data_horz[1] * data_vert[1]; - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); + 
queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); curr_element_1_C += data_horz[0] * data_vert[2]; curr_element_2_C += data_horz[2] * data_vert[0]; curr_element_3_C += data_horz[2] * data_vert[2]; - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); curr_element_1_C += data_horz[1] * data_vert[3]; curr_element_2_C += data_horz[3] * data_vert[1]; curr_element_3_C += data_horz[3] * data_vert[3]; @@ -912,17 +928,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - data_horz[0] = queue_pop(queue_prev_horz); - data_vert[0] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[0]); + queue_pop(queue_prev_vert, &data_vert[0]); data_dummy += data_horz[0] * data_vert[0]; - data_horz[1] = queue_pop(queue_prev_horz); - data_vert[1] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[1]); + queue_pop(queue_prev_vert, &data_vert[1]); data_dummy += data_horz[1] * data_vert[1]; - data_horz[2] = queue_pop(queue_prev_horz); - data_vert[2] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[2]); + queue_pop(queue_prev_vert, &data_vert[2]); data_dummy += data_horz[2] * data_vert[2]; - data_horz[3] = queue_pop(queue_prev_horz); - data_vert[3] = queue_pop(queue_prev_vert); + queue_pop(queue_prev_horz, &data_horz[3]); + queue_pop(queue_prev_vert, &data_vert[3]); data_dummy += data_horz[3] * data_vert[3]; // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY if (!data_dummy) From 72441bc90a90f8298077cf1dc89993cfda50db92 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Thu, 10 Jun 2021 17:46:17 +0200 Subject: [PATCH 12/24] [apps] Use 2 interleaved queues per direction in matmul_xqueue --- software/runtime/systolic/matmul_xqueue.h | 675 ++++++++++++---------- 1 file changed, 365 
insertions(+), 310 deletions(-) diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index 2ba8f317d..d9afe9a12 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -25,13 +25,15 @@ * (max dimension is 16-bit) * Matrix is processed in 2x2 submatrices with the following indexing * - * B B 0 2 - * B B 1 3 + * B B 0 1 + * B B 2 3 * - * A A C C = 0 1 0 1 - * A A C C 2 3 2 3 + * A A C C = 0 2 0 1 + * A A C C 1 3 2 3 * - * e.g. C0 = A1 * B1 + A0 * B0 + * e.g. C0 = A2 * B2 + A0 * B0 + * + * We use two interleaved queues per direction */ #include "alloc.h" @@ -50,18 +52,19 @@ typedef struct { // TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE // Array of queue ptrs in row-major order -int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; -int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +int32_t *queues_vert_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +int32_t *queues_vert_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; +int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; // queue push -static inline void queue_push(void *const queue, int32_t data, - int32_t *const ret) { - asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); +static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) { + asm volatile ("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); } // queue pop inline void queue_pop(void *const queue, int32_t *const ret) { - asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); + asm volatile ("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); } // TODO: GENERALIZE FOR ANY NUMBER OF TILES @@ -76,25 +79,41 @@ void systolic_init(uint32_t const *grid_mapping) { for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { tile_id = grid_mapping[grid_pos]; tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; - queues_vert[y][x] = &__seq_start + tile_offset + bank_sel[tile_id]; - queues_horz[y][x] = 
&__seq_start + tile_offset + bank_sel[tile_id] + 1; - bank_sel[tile_id] += 2; + queues_vert_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 0; + queues_vert_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1; + queues_horz_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 2; + queues_horz_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 3; + bank_sel[tile_id] += 4; ++grid_pos; } } // Print out queue addresses - // printf("queues_vert\n"); + // printf("queues_vert_0\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_vert_0[y][x]); + // } + // printf("\n"); + // } + // printf("queues_vert_1\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_vert_1[y][x]); + // } + // printf("\n"); + // } + // printf("queues_horz_0\n"); // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { - // printf("%5d ", queues_vert[y][x]); + // printf("%5d ", queues_horz_0[y][x]); // } // printf("\n"); // } - // printf("queues_horz\n"); + // printf("queues_horz_1\n"); // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { - // printf("%5d ", queues_horz[y][x]); + // printf("%5d ", queues_horz_1[y][x]); // } // printf("\n"); // } @@ -177,12 +196,16 @@ void systolic_rcp_pe(const uint32_t rep_count, systolic_matrix_t const *__restrict__ A, systolic_matrix_t const *__restrict__ B, systolic_matrix_t const *__restrict__ C) { - int32_t *queue_next_horz; - int32_t *queue_next_vert; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)) = 0; - int32_t resp_vert __attribute__((unused)) = 0; + int32_t resp_horz_0 
__attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; int32_t *matrix_A; int32_t *matrix_B; int32_t *matrix_C; @@ -198,8 +221,10 @@ void systolic_rcp_pe(const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - queue_next_horz = queues_horz[0][1]; - queue_next_vert = queues_vert[1][0]; + queue_next_horz_0 = queues_horz_0[0][1]; + queue_next_horz_1 = queues_horz_1[0][1]; + queue_next_vert_0 = queues_vert_0[1][0]; + queue_next_vert_1 = queues_vert_1[1][0]; // Get matrix arrays matrix_A = A->matrix; @@ -225,27 +250,27 @@ void systolic_rcp_pe(const uint32_t rep_count, for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[y * num_cols_A + i]; data_vert[0] = matrix_B[i * num_cols_B + x]; - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - data_horz[1] = matrix_A[y * num_cols_A + i + 1]; - data_vert[1] = matrix_B[(i + 1) * num_cols_B + x]; + data_horz[1] = matrix_A[(y + 1) * num_cols_A + i]; + data_vert[1] = matrix_B[i * num_cols_B + x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - data_horz[2] = matrix_A[(y + 1) * num_cols_A + i]; - data_vert[2] = matrix_B[i * num_cols_B + x + 1]; - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; 
+ data_horz[2] = matrix_A[y * num_cols_A + i + 1]; + data_vert[2] = matrix_B[(i + 1) * num_cols_B + x]; data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -264,13 +289,18 @@ void systolic_rcp_pe(const uint32_t rep_count, void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ B, systolic_matrix_t const *__restrict__ C) { - int32_t *queue_prev_horz; - int32_t *queue_next_horz; - int32_t *queue_next_vert; + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)) = 0; - int32_t resp_vert __attribute__((unused)) = 0; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; int32_t *matrix_B; int32_t *matrix_C; uint32_t num_cols_B; @@ -285,13 +315,17 @@ void 
systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - queue_prev_horz = queues_horz[0][col_idx]; + queue_prev_horz_0 = queues_horz_0[0][col_idx]; + queue_prev_horz_1 = queues_horz_1[0][col_idx]; if (col_idx == SYSTOLIC_SIZE - 1) { - queue_next_horz = NULL; + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; } else { - queue_next_horz = queues_horz[0][col_idx + 1]; + queue_next_horz_0 = queues_horz_0[0][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[0][col_idx + 1]; } - queue_next_vert = queues_vert[1][col_idx]; + queue_next_vert_0 = queues_vert_0[1][col_idx]; + queue_next_vert_1 = queues_vert_1[1][col_idx]; // Get matrix arrays matrix_B = B->matrix; @@ -303,7 +337,7 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, num_cols_C = C->num_cols; // Check if PE is at the right boundary - if (queue_next_horz) { + if (queue_next_horz_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -320,29 +354,29 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; - queue_pop(queue_prev_horz, &data_horz[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * 
data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -356,18 +390,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], 
&resp_vert); - queue_pop(queue_prev_horz, &data_horz[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); } } } @@ -389,25 +423,25 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; - queue_pop(queue_prev_horz, &data_horz[0]); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_push(queue_next_vert, data_vert[1], 
&resp_vert); - data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1]; - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_push(queue_next_vert, data_vert[2], &resp_vert); + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -421,14 +455,14 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_push(queue_next_vert, data_horz[0], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[1]); - queue_push(queue_next_vert, data_horz[1], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[2]); - queue_push(queue_next_vert, data_horz[2], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[3]); - queue_push(queue_next_vert, data_horz[3], &resp_vert); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_vert_0, data_horz[0], 
&resp_vert_0); + queue_push(queue_next_vert_1, data_horz[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_vert_0, data_horz[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_horz[3], &resp_vert_1); } } } @@ -440,13 +474,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ A, systolic_matrix_t const *__restrict__ C) { - int32_t *queue_next_horz; - int32_t *queue_prev_vert; - int32_t *queue_next_vert; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; - int32_t resp_horz __attribute__((unused)) = 0; - int32_t resp_vert __attribute__((unused)) = 0; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; int32_t *matrix_A; int32_t *matrix_C; uint32_t num_cols_A; @@ -461,12 +500,16 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, uint32_t anchor_row_1; // Assign queues - queue_next_horz = queues_horz[row_idx][1]; - queue_prev_vert = queues_vert[row_idx][0]; + queue_next_horz_0 = queues_horz_0[row_idx][1]; + queue_next_horz_1 = queues_horz_1[row_idx][1]; + queue_prev_vert_0 = queues_vert_0[row_idx][0]; + queue_prev_vert_1 = queues_vert_1[row_idx][0]; if (row_idx == SYSTOLIC_SIZE - 1) { - queue_next_vert = NULL; + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; } else { - queue_next_vert = queues_vert[row_idx + 1][0]; + queue_next_vert_0 = queues_vert_0[row_idx + 1][0]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][0]; } // Get matrix arrays @@ -479,7 +522,7 @@ void 
systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, num_cols_C = C->num_cols; // Check if PE is at the bottom boundary - if (queue_next_vert) { + if (queue_next_vert_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -497,28 +540,28 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); data_horz[3] = matrix_A[(shifted_y + 1) * 
num_cols_A + i + 1]; - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -532,18 +575,18 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], 
&resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); } } } @@ -566,24 +609,24 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < 2 * rep_count; i += 2) { data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i]; - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_vert, &data_vert[3]); - 
queue_push(queue_next_horz, data_horz[3], &resp_horz); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -597,14 +640,14 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_vert[0], &resp_horz); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_vert[1], &resp_horz); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_vert[2], &resp_horz); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_vert[3], &resp_horz); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_vert[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[1], &resp_horz_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_vert[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[3], &resp_horz_1); } } } @@ -616,15 +659,21 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, const uint32_t rep_count, systolic_matrix_t const *__restrict__ C) { - int32_t *queue_prev_horz; - int32_t *queue_next_horz; - int32_t *queue_prev_vert; - int32_t *queue_next_vert; + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t 
*queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; int32_t data_horz[4] = {0, 0, 0, 0}; int32_t data_vert[4] = {0, 0, 0, 0}; int32_t data_dummy __attribute__((unused)) = 0; - int32_t resp_horz __attribute__((unused)) = 0; - int32_t resp_vert __attribute__((unused)) = 0; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; int32_t *matrix_C; uint32_t num_rows_C; uint32_t num_cols_C; @@ -638,17 +687,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, uint32_t anchor_row_1; // Assign queues - queue_prev_horz = queues_horz[row_idx][col_idx]; + queue_prev_horz_0 = queues_horz_0[row_idx][col_idx]; + queue_prev_horz_1 = queues_horz_1[row_idx][col_idx]; if (col_idx == SYSTOLIC_SIZE - 1) { - queue_next_horz = NULL; + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; } else { - queue_next_horz = queues_horz[row_idx][col_idx + 1]; + queue_next_horz_0 = queues_horz_0[row_idx][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[row_idx][col_idx + 1]; } - queue_prev_vert = queues_vert[row_idx][col_idx]; + queue_prev_vert_0 = queues_vert_0[row_idx][col_idx]; + queue_prev_vert_1 = queues_vert_1[row_idx][col_idx]; if (row_idx == SYSTOLIC_SIZE - 1) { - queue_next_vert = NULL; + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; } else { - queue_next_vert = queues_vert[row_idx + 1][col_idx]; + queue_next_vert_0 = queues_vert_0[row_idx + 1][col_idx]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][col_idx]; } // Get matrix arrays @@ -659,7 +714,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, num_cols_C = C->num_cols; // PE is not at a boundary - if (queue_next_horz && queue_next_vert) { + if (queue_next_horz_0 && queue_next_vert_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; 
y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -677,29 +732,29 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + 
curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -713,22 +768,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_push(queue_next_vert, data_vert[0], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_push(queue_next_vert, data_vert[2], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - queue_push(queue_next_vert, data_vert[3], &resp_vert); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + 
queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); } } } @@ -736,7 +791,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } // PE is at the right boundary - if (!queue_next_horz && queue_next_vert) { + if (!queue_next_horz_0 && queue_next_vert_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -754,25 +809,25 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_vert, data_vert[0], &resp_vert); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_vert, data_vert[1], &resp_vert); - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_vert, data_vert[2], &resp_vert); - curr_element_1_C += 
data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_vert, data_vert[3], &resp_vert); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -786,22 +841,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); data_vert[0] += data_horz[0]; - queue_push(queue_next_vert, data_vert[0], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); data_vert[1] += data_horz[1]; - queue_push(queue_next_vert, data_vert[1], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + 
queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); data_vert[2] += data_horz[2]; - queue_push(queue_next_vert, data_vert[2], &resp_vert); - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); data_vert[3] += data_horz[3]; - queue_push(queue_next_vert, data_vert[3], &resp_vert); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); } } } @@ -809,7 +864,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } // PE is at the bottom boundary - if (queue_next_horz && !queue_next_vert) { + if (queue_next_horz_0 && !queue_next_vert_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -827,25 +882,25 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); - queue_push(queue_next_horz, data_horz[0], &resp_horz); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); - queue_push(queue_next_horz, data_horz[1], &resp_horz); - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); - queue_push(queue_next_horz, data_horz[2], &resp_horz); - 
curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); - queue_push(queue_next_horz, data_horz[3], &resp_horz); - curr_element_1_C += data_horz[1] * data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -859,22 +914,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); data_horz[0] += data_vert[0]; - queue_push(queue_next_horz, data_horz[0], &resp_horz); - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); data_horz[1] += data_vert[1]; - queue_push(queue_next_horz, data_horz[1], &resp_horz); - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + 
queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); data_horz[2] += data_vert[2]; - queue_push(queue_next_horz, data_horz[2], &resp_horz); - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); data_horz[3] += data_vert[3]; - queue_push(queue_next_horz, data_horz[3], &resp_horz); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); } } } @@ -882,7 +937,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } // PE is at the bottom right corner - if (!queue_next_horz && !queue_next_vert) { + if (!queue_next_horz_0 && !queue_next_vert_0) { // Execute step-wise matrix multiplication for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { @@ -900,21 +955,21 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, // Systolic matrix multiplication through MACs for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); curr_element_0_C += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); - curr_element_0_C += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); - curr_element_1_C += data_horz[0] * data_vert[2]; - curr_element_2_C += data_horz[2] * data_vert[0]; - curr_element_3_C += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); - curr_element_1_C += data_horz[1] * 
data_vert[3]; - curr_element_2_C += data_horz[3] * data_vert[1]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; curr_element_3_C += data_horz[3] * data_vert[3]; } @@ -928,17 +983,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, } else { // Pop and push dummy data for (uint32_t i = 0; i < rep_count; ++i) { - queue_pop(queue_prev_horz, &data_horz[0]); - queue_pop(queue_prev_vert, &data_vert[0]); + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); data_dummy += data_horz[0] * data_vert[0]; - queue_pop(queue_prev_horz, &data_horz[1]); - queue_pop(queue_prev_vert, &data_vert[1]); data_dummy += data_horz[1] * data_vert[1]; - queue_pop(queue_prev_horz, &data_horz[2]); - queue_pop(queue_prev_vert, &data_vert[2]); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); data_dummy += data_horz[2] * data_vert[2]; - queue_pop(queue_prev_horz, &data_horz[3]); - queue_pop(queue_prev_vert, &data_vert[3]); data_dummy += data_horz[3] * data_vert[3]; // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY if (!data_dummy) From 3674ea7b81d7034d6f331d583c65101877510dd1 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Tue, 15 Jun 2021 16:13:40 +0200 Subject: [PATCH 13/24] [apps] Improve matmul_xqueue code - generalize systolic_init() - add additional grid mappings 
--- software/apps/systolic/matmul_xqueue/main.c | 80 ++++++++++++--------- software/runtime/systolic/matmul_xqueue.h | 26 +++---- 2 files changed, 61 insertions(+), 45 deletions(-) diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index fafc4fbea..f7a648ab3 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -27,11 +27,12 @@ #include "synchronization.h" // Dimensions of matrices -#define DIM_M 16 -#define DIM_N 16 -#define DIM_P 16 +#define DIM_M 24 +#define DIM_N 24 +#define DIM_P 24 -uint32_t *grid_mapping; +uint32_t *tile_mapping; +uint32_t *core_mapping; int32_t *matrix_A; int32_t *matrix_B; @@ -77,7 +78,8 @@ int main() { // Allocate systolic grid mapping if (core_id == 0) { - grid_mapping = (uint32_t *)simple_malloc(num_cores * 4); + tile_mapping = (uint32_t *)simple_malloc(num_cores * 4); + core_mapping = (uint32_t *)simple_malloc(num_cores * 4); } // ---------- @@ -92,44 +94,53 @@ int main() { uint32_t col_idx = core_id / 4; uint32_t row_idx = core_id % 4; - // Assign grid position (tile wise) - // uint32_t col_idx; - // uint32_t row_idx; - // if (core_id < 4) { - // col_idx = core_id % 2; - // row_idx = core_id / 2; - // } else if (core_id < 8) { - // col_idx = core_id % 2 + 2; - // row_idx = core_id / 6; - // } else if (core_id < 12) { - // col_idx = core_id % 2; - // row_idx = core_id / 10 + 2; - // } else { - // col_idx = core_id % 2 + 2; - // row_idx = core_id / 14 + 2; - // } - - // uint32_t mapped_tile = tile_id; + // Assign grid position (square wise) + // uint32_t col_idx = tile_id % 2; + // col_idx *= 2; + // col_idx += core_id % 2; + // uint32_t row_idx = tile_id / 2; + // row_idx *= 2; + // row_idx += (core_id % 4) / 2; // ---------- // 256 CORES // ---------- + // Assign grid position (row wise) + // uint32_t col_idx = core_id % 16; + // uint32_t row_idx = core_id / 16; + // Assign grid position (col wise) // uint32_t col_idx = core_id 
/ 16; // uint32_t row_idx = core_id % 16; - // Assign grid position (tile wise) - // uint32_t mapped_group = core_id % 4; - // uint32_t col_idx = tile_id / 4; - // uint32_t row_idx = (tile_id % 4) + (mapped_group * 4); - // uint32_t mapped_tile = (tile_id % 16) + (mapped_group * 16); + // Assign grid position (square wise) + // uint32_t col_idx = tile_id % 8; + // col_idx *= 2; + // col_idx += core_id % 2; + // uint32_t row_idx = tile_id / 8; + // row_idx *= 2; + // row_idx += (core_id % 4) / 2; + + // Assign grid position (square square wise) + // uint32_t group_id = tile_id / 16; + // uint32_t add_col = group_id % 2; + // uint32_t add_row = group_id / 2; + // uint32_t col_idx = tile_id % 4; + // col_idx *= 2; + // col_idx += core_id % 2; + // col_idx += add_col * 8; + // uint32_t row_idx = (tile_id % 16) / 4; + // row_idx *= 2; + // row_idx += (core_id % 4) / 2; + // row_idx += add_row * 8; // Wait for all cores mempool_barrier(num_cores); - // Set systolic grid mapping - grid_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id; + // Set tile and core mapping + tile_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id; + core_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = core_id; // Wait for all cores mempool_barrier(num_cores); @@ -138,11 +149,14 @@ int main() { if (core_id == 0) { printf("> Initialize\n"); - // Print out grid mapping - // print_matrix((int32_t *)grid_mapping, 4, 4); + // Print out tile mapping + // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); + + // Print out core mapping + // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); // Initialize systolic array - systolic_init(grid_mapping); + systolic_init(tile_mapping, core_mapping); // Create systolic matrices generate_gradient_matrix(&matrix_A, DIM_M, DIM_N); diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index d9afe9a12..cb26e762b 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ 
b/software/runtime/systolic/matmul_xqueue.h @@ -58,32 +58,34 @@ int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE]; // queue push -static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) { - asm volatile ("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); +static inline void queue_push(void *const queue, int32_t data, + int32_t *const ret) { + asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); } // queue pop inline void queue_pop(void *const queue, int32_t *const ret) { - asm volatile ("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); + asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); } -// TODO: GENERALIZE FOR ANY NUMBER OF TILES -void systolic_init(uint32_t const *grid_mapping) { +void systolic_init(uint32_t const *tile_mapping, uint32_t const *core_mapping) { // Create systolic array via queues extern int32_t __seq_start; uint32_t grid_pos = 0; uint32_t tile_id; + uint32_t core_id; uint32_t tile_offset; - uint32_t bank_sel[4] = {0, 0, 0, 0}; + uint32_t core_offset; for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { - tile_id = grid_mapping[grid_pos]; + tile_id = tile_mapping[grid_pos]; + core_id = core_mapping[grid_pos]; tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; - queues_vert_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 0; - queues_vert_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1; - queues_horz_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 2; - queues_horz_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 3; - bank_sel[tile_id] += 4; + core_offset = core_id % 4 * 4; + queues_vert_0[y][x] = &__seq_start + tile_offset + core_offset + 0; + queues_vert_1[y][x] = &__seq_start + tile_offset + core_offset + 1; + queues_horz_0[y][x] = &__seq_start + tile_offset + core_offset + 2; + queues_horz_1[y][x] = &__seq_start + tile_offset + core_offset 
+ 3; ++grid_pos; } } From 3ea900b5c47140a8f6c3d688ea2b8b23ce8fb5f7 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Tue, 13 Sep 2022 15:58:22 +0200 Subject: [PATCH 14/24] [apps] Implement systolic xqueue 2d convolution --- software/apps/systolic/conv_xqueue/main.c | 250 +++++++++++++++ software/runtime/systolic/conv_xqueue.h | 358 ++++++++++++++++++++++ 2 files changed, 608 insertions(+) create mode 100644 software/apps/systolic/conv_xqueue/main.c create mode 100644 software/runtime/systolic/conv_xqueue.h diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c new file mode 100644 index 000000000..e3b3644ab --- /dev/null +++ b/software/apps/systolic/conv_xqueue/main.c @@ -0,0 +1,250 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Gua Hao Khov, ETH Zurich + +#include +#include + +#include "alloc.h" +#include "encoding.h" +#include "systolic/conv_xqueue.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +// Dimensions of matrix X +#define DIM_X_M 32 +#define DIM_X_N 32 + +// Dimensions of matrix Y +#define DIM_Y_M (DIM_X_M - (KERNEL_SIZE - 1)) +#define DIM_Y_N (DIM_X_N - (KERNEL_SIZE - 1)) + +// Dimensions of maps +#define KERNEL_ROWS KERNEL_SIZE +#define KERNEL_COLS KERNEL_SIZE *NUM_KERNELS +#define NUM_ACCS NUM_KERNELS + +uint32_t *kernel_tile_map; +uint32_t *kernel_core_map; +uint32_t *row_acc_tile_map; +uint32_t *row_acc_core_map; + +int32_t *matrix_X; +int32_t *matrix_Y; + +int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; + +void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows, + uint32_t num_cols) { + int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4); + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + new_matrix[y * num_cols + x] = (int32_t)(y + x); + } + } + *matrix = new_matrix; +} + +void print_matrix(int32_t const *matrix, uint32_t num_rows, + uint32_t num_columns) { + printf("Matrix at 0x%8X\n", (uint32_t)matrix); + for (uint32_t i = 0; i < num_rows; ++i) { + for (uint32_t j = 0; j < num_columns; ++j) { + printf("%5d ", matrix[i * num_columns + j]); + } + printf("\n"); + } +} + +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + uint32_t tile_id = core_id / 4; + + // Initialize synchronization variables + mempool_barrier_init(core_id); + + // Initialization + mempool_init(core_id, num_cores); + + // Allocate tile and core maps + if (core_id == 0) { + kernel_tile_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4); + kernel_core_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4); + row_acc_tile_map = (uint32_t *)simple_malloc(NUM_ACCS * 4); + row_acc_core_map = (uint32_t 
*)simple_malloc(NUM_ACCS * 4); + } + + // Systolic identifiers + int32_t is_enabled = 0; + int32_t is_kernel_core = 0; + uint32_t kernel_id = 0; + uint32_t kernel_row = 0; + uint32_t kernel_col = 0; + + // ---------- + // ACC COMBO + // ---------- + + // TODO: VISUAL DESCRIPTION + // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3 + + kernel_id = tile_id / 5; + uint32_t kernel_pair_id = tile_id % 5; + uint32_t tile_core_id = core_id % 4; + if (kernel_pair_id < 3) { + is_kernel_core = 1; + kernel_row = kernel_pair_id; + kernel_col = tile_core_id % 2; + kernel_id += tile_core_id / 2; + } else { + if (tile_core_id == 3) { + is_kernel_core = 0; + } else { + is_kernel_core = 1; + kernel_row = tile_core_id; + kernel_col = 2; + } + kernel_id += kernel_pair_id % 3; + } + + // Core is only enabled if its kernel is required + if (kernel_id < NUM_KERNELS) { + is_enabled = 1; + } else { + is_enabled = 0; + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Set tile and core maps + if (is_enabled) { + if (is_kernel_core) { + kernel_tile_map[kernel_row * KERNEL_COLS + kernel_col] = tile_id; + kernel_core_map[kernel_row * KERNEL_COLS + kernel_col] = core_id; + } else { + row_acc_tile_map[kernel_id] = tile_id; + row_acc_core_map[kernel_id] = core_id; + } + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Setup + if (core_id == 0) { + printf("> Initialize\n"); + + // Print out maps + // print_matrix((int32_t *)kernel_tile_map, KERNEL_ROWS, KERNEL_COLS); + // print_matrix((int32_t *)kernel_core_map, KERNEL_ROWS, KERNEL_COLS); + // print_matrix((int32_t *)row_acc_tile_map, 1, NUM_ACCS); + // print_matrix((int32_t *)row_acc_core_map, 1, NUM_ACCS); + + // Initialize systolic array + systolic_init(kernel_tile_map, kernel_core_map, row_acc_tile_map, + row_acc_core_map); + + // Create and initialize matrices + generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N); + matrix_Y = (int32_t *)simple_malloc(DIM_Y_M * DIM_Y_N * 4); + + // Print out matrix X + // 
printf("> Print Matrix X\n"); + // print_matrix(matrix_X, DIM_X_M, DIM_X_N); + } + + // Wait for all cores + mempool_barrier(num_cores); + + if (core_id == 0) { + // Start benchmark + printf("> Start\n"); + // mempool_start_benchmark(); + } + + // Start benchmark for all cores + mempool_barrier(num_cores); + mempool_start_benchmark(); + + // Wait for all cores + mempool_barrier(num_cores); + + if (is_enabled) { + if (is_kernel_core) { + switch (kernel_col) { + case 0: + if (kernel_id == 0) { + systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + matrix_X, (int32_t *)weights); + } else { + if (kernel_row == 2) { + systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + matrix_X, (int32_t *)weights); + } else { + systolic_conv_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); + } + } + break; + case (KERNEL_SIZE - 1): + if (kernel_id == NUM_KERNELS - 1) { + systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); + } else { + if (kernel_row == 0) { + systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); + } else { + systolic_conv_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); + } + } + break; + default: + systolic_conv_follower(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); + } + } else { + systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y); + } + } + + // Wait for all cores + mempool_barrier(num_cores); + + // Stop benchmark for all cores + mempool_stop_benchmark(); + mempool_barrier(num_cores); + + // Print out benchmark + if (core_id == 0) { + // Stop benchmark + // mempool_stop_benchmark(); + printf("> End\n"); + + // Print out matrix Y + printf("> Print Matrix Y\n"); + print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N); + } + + // wait until all cores have finished + mempool_barrier(num_cores); + return 0; +} diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h new 
file mode 100644 index 000000000..8730dcef3 --- /dev/null +++ b/software/runtime/systolic/conv_xqueue.h @@ -0,0 +1,358 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Gua Hao Khov, ETH Zurich + +/* This library implements a simple systolic architecture emulation + * using global code based orchestration + */ + +/* TODO DESCRIPTION + * + * + * + * + * + * + */ + +#include "alloc.h" +#include "printf.h" + +// Kernel size (fixed) +#define KERNEL_SIZE 3 + +// Number of kernels +#define NUM_KERNELS 1 + +// Array of queue ptrs in row-major order (concatenated kernels) +int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE]; +int32_t *queues_y[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE]; +int32_t *queues_row_acc[KERNEL_SIZE][NUM_KERNELS]; + +// queue push +static inline void queue_push(void *const queue, int32_t data, + int32_t *const ret) { + asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); +} + +// queue pop +inline void queue_pop(void *const queue, int32_t *const ret) { + asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); +} + +void systolic_init(uint32_t const *kernel_tile_map, + uint32_t const *kernel_core_map, + uint32_t const *row_acc_tile_map, + uint32_t const *row_acc_core_map) { + // Create systolic array via queues + extern int32_t __seq_start; + uint32_t grid_pos; + uint32_t tile_id; + uint32_t 
core_id; + uint32_t tile_offset; + uint32_t core_offset; + + // Kernel queues + grid_pos = 0; + for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { + for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { + tile_id = kernel_tile_map[grid_pos]; + core_id = kernel_core_map[grid_pos]; + tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; + core_offset = core_id % 4 * 4; + queues_x[y][x] = &__seq_start + tile_offset + core_offset + 0; + queues_y[y][x] = &__seq_start + tile_offset + core_offset + 1; + ++grid_pos; + } + } + + // Row accumulator queues + grid_pos = 0; + for (uint32_t x = 0; x < NUM_KERNELS; ++x) { + tile_id = row_acc_tile_map[x]; + core_id = row_acc_core_map[x]; + tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; + core_offset = core_id % 4 * 4; + queues_row_acc[0][x] = &__seq_start + tile_offset + core_offset + 0; + queues_row_acc[1][x] = &__seq_start + tile_offset + core_offset + 1; + queues_row_acc[2][x] = &__seq_start + tile_offset + core_offset + 2; + } + + // Print out queue addresses + // printf("queues_x\n"); + // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { + // for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { + // printf("%5d ", queues_x[y][x]); + // } + // printf("\n"); + // } + // printf("queues_y\n"); + // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { + // for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { + // printf("%5d ", queues_y[y][x]); + // } + // printf("\n"); + // } + // printf("queues_row_acc\n"); + // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { + // for (uint32_t x = 0; x < NUM_KERNELS; ++x) { + // printf("%5d ", queues_row_acc[y][x]); + // } + // printf("\n"); + // } +} + +void systolic_conv_first_leader(const uint32_t kernel_id, + const uint32_t kernel_row, + const uint32_t num_rows, + const uint32_t num_cols, + int32_t const *__restrict__ X, + int32_t const *__restrict__ W) { + int32_t *queue_next_x; + int32_t *queue_next_y; + int32_t resp_x __attribute__((unused)) = 0; + int32_t resp_y __attribute__((unused)) = 0; + 
int32_t weight; + int32_t curr_x; + int32_t curr_y; + uint32_t first_row = kernel_id + kernel_row; + uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; + + // Assign queues + queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; + queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; + + // Load weight + weight = W[kernel_row * KERNEL_SIZE + 0]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { + // Populate kernel + curr_x = X[row * num_cols + 0]; + queue_push(queue_next_x, curr_x, &resp_x); + curr_x = X[row * num_cols + 1]; + queue_push(queue_next_x, curr_x, &resp_x); + curr_x = X[row * num_cols + 2]; + // Convolution + for (uint32_t col = 3; col < num_cols; ++col) { + queue_push(queue_next_x, curr_x, &resp_x); + curr_y = curr_x * weight; + curr_x = X[row * num_cols + col]; + queue_push(queue_next_y, curr_y, &resp_y); + } + // Flush kernel + queue_push(queue_next_x, curr_x, &resp_x); + curr_y = curr_x * weight; + queue_push(queue_next_y, curr_y, &resp_y); + } +} + +void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { + int32_t *queue_prev_x; + int32_t *queue_next_x; + int32_t *queue_next_y; + int32_t resp_x __attribute__((unused)) = 0; + int32_t resp_y __attribute__((unused)) = 0; + int32_t weight; + int32_t curr_x; + int32_t curr_y; + uint32_t first_row = kernel_id + kernel_row; + uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; + + // Assign queues + queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 0]; + queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; + queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; + + // Load weight + weight = W[kernel_row * KERNEL_SIZE + 0]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { + // 
Populate kernel + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_x, curr_x, &resp_x); + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_x, curr_x, &resp_x); + queue_pop(queue_prev_x, &curr_x); + // Convolution + for (uint32_t col = 3; col < num_cols; ++col) { + queue_push(queue_next_x, curr_x, &resp_x); + curr_y = curr_x * weight; + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_y, curr_y, &resp_y); + } + // Flush kernel + queue_push(queue_next_x, curr_x, &resp_x); + curr_y = curr_x * weight; + queue_push(queue_next_y, curr_y, &resp_y); + } +} + +void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { + int32_t *queue_prev_x; + int32_t *queue_next_x; + int32_t *queue_prev_y; + int32_t *queue_next_y; + int32_t resp_x __attribute__((unused)) = 0; + int32_t resp_y __attribute__((unused)) = 0; + int32_t weight; + int32_t curr_x; + int32_t curr_y; + uint32_t first_row = kernel_id + kernel_row; + uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; + + // Assign queues + queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; + queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; + queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; + queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; + + // Load weight + weight = W[kernel_row * KERNEL_SIZE + 1]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { + // Populate kernel + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_x, curr_x, &resp_x); + queue_pop(queue_prev_x, &curr_x); + // Convolution + for (uint32_t col = 2; col < num_cols; ++col) { + queue_pop(queue_prev_y, &curr_y); + queue_push(queue_next_x, curr_x, &resp_x); + curr_y += curr_x * weight; + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_y, curr_y, &resp_y); + } + // Flush 
kernel + queue_push(queue_next_x, curr_x, &resp_x); + } +} + +void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { + int32_t *queue_prev_x; + int32_t *queue_next_x; + int32_t *queue_prev_y; + int32_t *queue_next_y; + int32_t resp_x __attribute__((unused)) = 0; + int32_t resp_y __attribute__((unused)) = 0; + int32_t weight; + int32_t curr_x; + int32_t curr_y; + uint32_t first_row = kernel_id + kernel_row; + uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; + + // Assign queues + queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; + queue_next_x = queues_x[kernel_row + 1][(kernel_id + 1) * KERNEL_SIZE]; + queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; + queue_next_y = queues_row_acc[kernel_row][kernel_id]; + + // Load weight + weight = W[kernel_row * KERNEL_SIZE + 2]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { + // Populate kernel + queue_pop(queue_prev_x, &curr_x); + // Convolution + for (uint32_t col = 1; col < num_cols - 1; ++col) { + queue_pop(queue_prev_y, &curr_y); + queue_push(queue_next_x, curr_x, &resp_x); + curr_y += curr_x * weight; + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_y, curr_y, &resp_y); + } + // Flush kernel + queue_push(queue_next_x, curr_x, &resp_x); + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_x, curr_x, &resp_x); + } +} + +void systolic_conv_last_NAME(const uint32_t kernel_id, + const uint32_t kernel_row, const uint32_t num_rows, + const uint32_t num_cols, + int32_t const *__restrict__ W) { + int32_t *queue_prev_x; + int32_t *queue_prev_y; + int32_t *queue_next_y; + int32_t resp_y __attribute__((unused)) = 0; + int32_t weight; + int32_t curr_x; + int32_t curr_y; + uint32_t first_row = kernel_id + kernel_row; + uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; + + // Assign 
queues + queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; + queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; + queue_next_y = queues_row_acc[kernel_row][kernel_id]; + + // Load weight + weight = W[kernel_row * KERNEL_SIZE + 2]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { + // Populate kernel + queue_pop(queue_prev_x, &curr_x); + // Convolution + for (uint32_t col = 1; col < num_cols - 1; ++col) { + queue_pop(queue_prev_y, &curr_y); + curr_y += curr_x * weight; + queue_pop(queue_prev_x, &curr_x); + queue_push(queue_next_y, curr_y, &resp_y); + } + // Flush kernel + queue_pop(queue_prev_x, &curr_x); + } +} + +void systolic_conv_row_acc(const uint32_t kernel_id, const uint32_t num_rows_y, + const uint32_t num_cols_y, int32_t *__restrict__ Y) { + int32_t *queue_y_0; + int32_t *queue_y_1; + int32_t *queue_y_2; + int32_t curr_y_0; + int32_t curr_y_1; + int32_t curr_y_2; + int32_t total_y; + + // Assign queues + queue_y_0 = queues_row_acc[0][kernel_id]; + queue_y_1 = queues_row_acc[1][kernel_id]; + queue_y_2 = queues_row_acc[2][kernel_id]; + + // Execute row-wise systolic 2d convolution + for (uint32_t row = kernel_id; row < num_rows_y; row += NUM_KERNELS) { + // Accumulate and Store + for (uint32_t col = 0; col < num_cols_y; ++col) { + queue_pop(queue_y_0, &curr_y_0); + queue_pop(queue_y_1, &curr_y_1); + queue_pop(queue_y_2, &curr_y_2); + total_y = curr_y_0 + curr_y_1 + curr_y_2; + Y[row * num_cols_y + col] = total_y; + } + } +} From 8a3b524bed3758ba8ec2153329c054a4cbb1034c Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Wed, 23 Jun 2021 13:41:12 +0200 Subject: [PATCH 15/24] [apps] Improve conv_xqueue code - change PEs role names - add different core mapping - flush queues at the end of execution --- software/apps/systolic/conv_xqueue/main.c | 93 +++++++++++++++++------ software/runtime/systolic/conv_xqueue.h | 42 +++++----- 2 files changed, 91 insertions(+), 44 
deletions(-) diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c index e3b3644ab..7184962f7 100644 --- a/software/apps/systolic/conv_xqueue/main.c +++ b/software/apps/systolic/conv_xqueue/main.c @@ -36,7 +36,7 @@ // Dimensions of maps #define KERNEL_ROWS KERNEL_SIZE -#define KERNEL_COLS KERNEL_SIZE *NUM_KERNELS +#define KERNEL_COLS (KERNEL_SIZE * NUM_KERNELS) #define NUM_ACCS NUM_KERNELS uint32_t *kernel_tile_map; @@ -101,17 +101,23 @@ int main() { // ACC COMBO // ---------- - // TODO: VISUAL DESCRIPTION + // XY: X = Tile and Y = Core % 4 + // + // 00 01 30 ** + // 10 11 31 33 02 03 40 ** + // 20 21 32 ** 12 13 41 43 + // 22 23 42 ** + // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3 - kernel_id = tile_id / 5; - uint32_t kernel_pair_id = tile_id % 5; + uint32_t group_id = tile_id / 5; + uint32_t group_tile_id = tile_id % 5; uint32_t tile_core_id = core_id % 4; - if (kernel_pair_id < 3) { + if (group_tile_id < 3) { is_kernel_core = 1; - kernel_row = kernel_pair_id; + kernel_row = group_tile_id; kernel_col = tile_core_id % 2; - kernel_id += tile_core_id / 2; + kernel_id = 2 * group_id + (tile_core_id / 2); } else { if (tile_core_id == 3) { is_kernel_core = 0; @@ -120,9 +126,47 @@ int main() { kernel_row = tile_core_id; kernel_col = 2; } - kernel_id += kernel_pair_id % 3; + kernel_id = 2 * group_id + (group_tile_id % 3); } + // ---------- + // LONG ROWS + // ---------- + + // XY: X = Tile and Y = Core % 4 + // + // 00 01 02 ** + // 10 11 12 90 13 30 31 ** + // 20 21 22 ** 23 40 41 91 42 43 60 ** + // 03 50 51 ** 52 53 70 92 71 72 73 ** + // 32 33 80 ** 81 82 83 93 + // 61 62 63 ** + + // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3 + + // uint32_t group_id = tile_id / 10; + // uint32_t group_tile_id = tile_id % 10; + // uint32_t tile_core_id = core_id % 4; + // if (group_tile_id < 9) { + // is_kernel_core = 1; + // uint32_t group_kernel_id = group_tile_id / 3; + // kernel_row = group_tile_id % 3; + // kernel_col = 
(tile_core_id + group_kernel_id) % 3; + // uint32_t threshold = 3 - group_kernel_id; + // if (tile_core_id >= threshold) { + // group_kernel_id += 1; + // if (kernel_row == 0) { + // kernel_row = 2; + // } else { + // kernel_row -= 1; + // } + // } + // kernel_id = 4 * group_id + group_kernel_id; + // } else { + // is_kernel_core = 0; + // kernel_id = 4 * group_id + tile_core_id; + // } + // Core is only enabled if its kernel is required if (kernel_id < NUM_KERNELS) { is_enabled = 1; @@ -136,8 +180,9 @@ int main() { // Set tile and core maps if (is_enabled) { if (is_kernel_core) { - kernel_tile_map[kernel_row * KERNEL_COLS + kernel_col] = tile_id; - kernel_core_map[kernel_row * KERNEL_COLS + kernel_col] = core_id; + uint32_t map_col = KERNEL_SIZE * kernel_id + kernel_col; + kernel_tile_map[kernel_row * KERNEL_COLS + map_col] = tile_id; + kernel_core_map[kernel_row * KERNEL_COLS + map_col] = core_id; } else { row_acc_tile_map[kernel_id] = tile_id; row_acc_core_map[kernel_id] = core_id; @@ -191,35 +236,35 @@ int main() { switch (kernel_col) { case 0: if (kernel_id == 0) { - systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - matrix_X, (int32_t *)weights); + systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + matrix_X, (int32_t *)weights); } else { if (kernel_row == 2) { - systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - matrix_X, (int32_t *)weights); + systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + matrix_X, (int32_t *)weights); } else { - systolic_conv_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); + systolic_conv_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); } } break; case (KERNEL_SIZE - 1): if (kernel_id == NUM_KERNELS - 1) { - systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); + systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); } else { if (kernel_row == 0) { - 
systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); + systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); } else { - systolic_conv_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); + systolic_conv_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); } } break; default: - systolic_conv_follower(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); + systolic_conv_mid(kernel_id, kernel_row, DIM_X_M, DIM_X_N, + (int32_t *)weights); } } else { systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y); diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h index 8730dcef3..85e372f42 100644 --- a/software/runtime/systolic/conv_xqueue.h +++ b/software/runtime/systolic/conv_xqueue.h @@ -36,7 +36,7 @@ #define KERNEL_SIZE 3 // Number of kernels -#define NUM_KERNELS 1 +#define NUM_KERNELS 25 // Array of queue ptrs in row-major order (concatenated kernels) int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE]; @@ -116,12 +116,11 @@ void systolic_init(uint32_t const *kernel_tile_map, // } } -void systolic_conv_first_leader(const uint32_t kernel_id, - const uint32_t kernel_row, - const uint32_t num_rows, - const uint32_t num_cols, - int32_t const *__restrict__ X, - int32_t const *__restrict__ W) { +void systolic_conv_first_front(const uint32_t kernel_id, + const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ X, + int32_t const *__restrict__ W) { int32_t *queue_next_x; int32_t *queue_next_y; int32_t resp_x __attribute__((unused)) = 0; @@ -161,9 +160,9 @@ void systolic_conv_first_leader(const uint32_t kernel_id, } } -void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { +void systolic_conv_front(const uint32_t kernel_id, const uint32_t kernel_row, + const 
uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { int32_t *queue_prev_x; int32_t *queue_next_x; int32_t *queue_next_y; @@ -205,9 +204,9 @@ void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row, } } -void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { +void systolic_conv_mid(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { int32_t *queue_prev_x; int32_t *queue_next_x; int32_t *queue_prev_y; @@ -248,9 +247,9 @@ void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row, } } -void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { +void systolic_conv_end(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { int32_t *queue_prev_x; int32_t *queue_next_x; int32_t *queue_prev_y; @@ -289,12 +288,15 @@ void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row, queue_pop(queue_prev_x, &curr_x); queue_push(queue_next_x, curr_x, &resp_x); } + + // Flush next queues at the end of execution + queue_pop(queue_next_x, &curr_x); + queue_pop(queue_next_x, &curr_x); } -void systolic_conv_last_NAME(const uint32_t kernel_id, - const uint32_t kernel_row, const uint32_t num_rows, - const uint32_t num_cols, - int32_t const *__restrict__ W) { +void systolic_conv_last_end(const uint32_t kernel_id, const uint32_t kernel_row, + const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ W) { int32_t *queue_prev_x; int32_t *queue_prev_y; int32_t *queue_next_y; From e29d84cb822ccb318aba054d66d95182042dd76e Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Fri, 25 Jun 2021 02:43:19 +0200 
Subject: [PATCH 16/24] [apps] Improve 2d conv density for conv_xqueue --- software/apps/systolic/conv_xqueue/main.c | 178 +---- software/runtime/systolic/conv_xqueue.h | 840 +++++++++++++++------- 2 files changed, 597 insertions(+), 421 deletions(-) diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c index 7184962f7..6fd8045b5 100644 --- a/software/apps/systolic/conv_xqueue/main.c +++ b/software/apps/systolic/conv_xqueue/main.c @@ -31,23 +31,17 @@ #define DIM_X_N 32 // Dimensions of matrix Y -#define DIM_Y_M (DIM_X_M - (KERNEL_SIZE - 1)) -#define DIM_Y_N (DIM_X_N - (KERNEL_SIZE - 1)) +#define DIM_Y_M (DIM_X_M - 2) +#define DIM_Y_N (DIM_X_N - 2) -// Dimensions of maps -#define KERNEL_ROWS KERNEL_SIZE -#define KERNEL_COLS (KERNEL_SIZE * NUM_KERNELS) -#define NUM_ACCS NUM_KERNELS - -uint32_t *kernel_tile_map; -uint32_t *kernel_core_map; -uint32_t *row_acc_tile_map; -uint32_t *row_acc_core_map; +uint32_t *tile_map; +uint32_t *core_map; int32_t *matrix_X; int32_t *matrix_Y; int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; +int32_t *matrix_W = (int32_t *)weights; void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows, uint32_t num_cols) { @@ -84,110 +78,16 @@ int main() { // Allocate tile and core maps if (core_id == 0) { - kernel_tile_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4); - kernel_core_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4); - row_acc_tile_map = (uint32_t *)simple_malloc(NUM_ACCS * 4); - row_acc_core_map = (uint32_t *)simple_malloc(NUM_ACCS * 4); - } - - // Systolic identifiers - int32_t is_enabled = 0; - int32_t is_kernel_core = 0; - uint32_t kernel_id = 0; - uint32_t kernel_row = 0; - uint32_t kernel_col = 0; - - // ---------- - // ACC COMBO - // ---------- - - // XY: X = Tile and Y = Core % 4 - // - // 00 01 30 ** - // 10 11 31 33 02 03 40 ** - // 20 21 32 ** 12 13 41 43 - // 22 23 42 ** - - // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3 - - 
uint32_t group_id = tile_id / 5; - uint32_t group_tile_id = tile_id % 5; - uint32_t tile_core_id = core_id % 4; - if (group_tile_id < 3) { - is_kernel_core = 1; - kernel_row = group_tile_id; - kernel_col = tile_core_id % 2; - kernel_id = 2 * group_id + (tile_core_id / 2); - } else { - if (tile_core_id == 3) { - is_kernel_core = 0; - } else { - is_kernel_core = 1; - kernel_row = tile_core_id; - kernel_col = 2; - } - kernel_id = 2 * group_id + (group_tile_id % 3); - } - - // ---------- - // LONG ROWS - // ---------- - - // XY: X = Tile and Y = Core % 4 - // - // 00 01 02 ** - // 10 11 12 90 13 30 31 ** - // 20 21 22 ** 23 40 41 91 42 43 60 ** - // 03 50 51 ** 52 53 70 92 71 72 73 ** - // 32 33 80 ** 81 82 83 93 - // 61 62 63 ** - - // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3 - - // uint32_t group_id = tile_id / 10; - // uint32_t group_tile_id = tile_id % 10; - // uint32_t tile_core_id = core_id % 4; - // if (group_tile_id < 9) { - // is_kernel_core = 1; - // uint32_t group_kernel_id = group_tile_id / 3; - // kernel_row = group_tile_id % 3; - // kernel_col = (tile_core_id + group_kernel_id) % 3; - // uint32_t threshold = 3 - group_kernel_id; - // if (tile_core_id >= threshold) { - // group_kernel_id += 1; - // if (kernel_row == 0) { - // kernel_row = 2; - // } else { - // kernel_row -= 1; - // } - // } - // kernel_id = 4 * group_id + group_kernel_id; - // } else { - // is_kernel_core = 0; - // kernel_id = 4 * group_id + tile_core_id; - // } - - // Core is only enabled if its kernel is required - if (kernel_id < NUM_KERNELS) { - is_enabled = 1; - } else { - is_enabled = 0; + tile_map = (uint32_t *)simple_malloc(num_cores * 4); + core_map = (uint32_t *)simple_malloc(num_cores * 4); } // Wait for all cores mempool_barrier(num_cores); // Set tile and core maps - if (is_enabled) { - if (is_kernel_core) { - uint32_t map_col = KERNEL_SIZE * kernel_id + kernel_col; - kernel_tile_map[kernel_row * KERNEL_COLS + map_col] = tile_id; - kernel_core_map[kernel_row * 
KERNEL_COLS + map_col] = core_id; - } else { - row_acc_tile_map[kernel_id] = tile_id; - row_acc_core_map[kernel_id] = core_id; - } - } + tile_map[core_id] = tile_id; + core_map[core_id] = core_id; // Wait for all cores mempool_barrier(num_cores); @@ -197,14 +97,11 @@ int main() { printf("> Initialize\n"); // Print out maps - // print_matrix((int32_t *)kernel_tile_map, KERNEL_ROWS, KERNEL_COLS); - // print_matrix((int32_t *)kernel_core_map, KERNEL_ROWS, KERNEL_COLS); - // print_matrix((int32_t *)row_acc_tile_map, 1, NUM_ACCS); - // print_matrix((int32_t *)row_acc_core_map, 1, NUM_ACCS); + // print_matrix((int32_t *)tile_map, 1, num_cores); + // print_matrix((int32_t *)core_map, 1, num_cores); // Initialize systolic array - systolic_init(kernel_tile_map, kernel_core_map, row_acc_tile_map, - row_acc_core_map); + systolic_init(tile_map, core_map); // Create and initialize matrices generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N); @@ -231,44 +128,15 @@ int main() { // Wait for all cores mempool_barrier(num_cores); - if (is_enabled) { - if (is_kernel_core) { - switch (kernel_col) { - case 0: - if (kernel_id == 0) { - systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - matrix_X, (int32_t *)weights); - } else { - if (kernel_row == 2) { - systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - matrix_X, (int32_t *)weights); - } else { - systolic_conv_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); - } - } - break; - case (KERNEL_SIZE - 1): - if (kernel_id == NUM_KERNELS - 1) { - systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); - } else { - if (kernel_row == 0) { - systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); - } else { - systolic_conv_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); - } - } - break; - default: - systolic_conv_mid(kernel_id, kernel_row, DIM_X_M, DIM_X_N, - (int32_t *)weights); - } - } else { - 
systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y); - } + switch (core_id) { + case 0: + systolic_conv_front(DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y); + break; + case (NUM_CORES - 1): + systolic_conv_end(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y); + break; + default: + systolic_conv_mid(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y); } // Wait for all cores @@ -285,8 +153,8 @@ int main() { printf("> End\n"); // Print out matrix Y - printf("> Print Matrix Y\n"); - print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N); + // printf("> Print Matrix Y\n"); + // print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N); } // wait until all cores have finished diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h index 85e372f42..7224acb90 100644 --- a/software/runtime/systolic/conv_xqueue.h +++ b/software/runtime/systolic/conv_xqueue.h @@ -21,8 +21,8 @@ */ /* TODO DESCRIPTION - * - * + * TODO: LIMITATION NUM_COLS_Y >= 2 <=> NUM_COLS >= 4 + * TODO: COMPLETELY FIXED TO KERNEL SIZE OF 3 * * * @@ -32,16 +32,9 @@ #include "alloc.h" #include "printf.h" -// Kernel size (fixed) -#define KERNEL_SIZE 3 - -// Number of kernels -#define NUM_KERNELS 25 - // Array of queue ptrs in row-major order (concatenated kernels) -int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE]; -int32_t *queues_y[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE]; -int32_t *queues_row_acc[KERNEL_SIZE][NUM_KERNELS]; +int32_t *queues_x_0[NUM_CORES]; +int32_t *queues_x_1[NUM_CORES]; // queue push static inline void queue_push(void *const queue, int32_t data, @@ -54,307 +47,622 @@ inline void queue_pop(void *const queue, int32_t *const ret) { asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); } -void systolic_init(uint32_t const *kernel_tile_map, - uint32_t const *kernel_core_map, - uint32_t const *row_acc_tile_map, - uint32_t const *row_acc_core_map) { +void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) { // Create systolic array via 
queues extern int32_t __seq_start; - uint32_t grid_pos; uint32_t tile_id; uint32_t core_id; uint32_t tile_offset; uint32_t core_offset; - // Kernel queues - grid_pos = 0; - for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { - for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { - tile_id = kernel_tile_map[grid_pos]; - core_id = kernel_core_map[grid_pos]; - tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; - core_offset = core_id % 4 * 4; - queues_x[y][x] = &__seq_start + tile_offset + core_offset + 0; - queues_y[y][x] = &__seq_start + tile_offset + core_offset + 1; - ++grid_pos; - } - } - - // Row accumulator queues - grid_pos = 0; - for (uint32_t x = 0; x < NUM_KERNELS; ++x) { - tile_id = row_acc_tile_map[x]; - core_id = row_acc_core_map[x]; + for (uint32_t i = 0; i < NUM_CORES; ++i) { + tile_id = tile_map[i]; + core_id = core_map[i]; tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; core_offset = core_id % 4 * 4; - queues_row_acc[0][x] = &__seq_start + tile_offset + core_offset + 0; - queues_row_acc[1][x] = &__seq_start + tile_offset + core_offset + 1; - queues_row_acc[2][x] = &__seq_start + tile_offset + core_offset + 2; + queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0; + queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1; } // Print out queue addresses - // printf("queues_x\n"); - // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { - // for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { - // printf("%5d ", queues_x[y][x]); - // } - // printf("\n"); - // } - // printf("queues_y\n"); - // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { - // for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) { - // printf("%5d ", queues_y[y][x]); - // } - // printf("\n"); + // printf("queues_x_0\n"); + // for (uint32_t i = 0; i < NUM_CORES; ++i) { + // printf("%5d ", queues_x_0[i]); // } - // printf("queues_row_acc\n"); - // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) { - // for (uint32_t x = 0; x < NUM_KERNELS; ++x) { - // printf("%5d ", 
queues_row_acc[y][x]); - // } - // printf("\n"); + // printf("\n"); + // printf("queues_x_1\n"); + // for (uint32_t i = 0; i < NUM_CORES; ++i) { + // printf("%5d ", queues_x_1[i]); // } + // printf("\n"); } -void systolic_conv_first_front(const uint32_t kernel_id, - const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ X, - int32_t const *__restrict__ W) { - int32_t *queue_next_x; - int32_t *queue_next_y; - int32_t resp_x __attribute__((unused)) = 0; - int32_t resp_y __attribute__((unused)) = 0; - int32_t weight; - int32_t curr_x; - int32_t curr_y; - uint32_t first_row = kernel_id + kernel_row; - uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; +void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, + int32_t const *__restrict__ X, + int32_t const *__restrict__ W, + int32_t *__restrict__ Y) { + int32_t *queue_next_x_0; + int32_t *queue_next_x_1; + int32_t resp_x_0 __attribute__((unused)) = 0; + int32_t resp_x_1 __attribute__((unused)) = 0; + int32_t weights[3][3]; + int32_t curr_x[3]; + int32_t acc_y[3] = {0, 0, 0}; + uint32_t row; + uint32_t col; + uint32_t num_cols_y = num_cols - 2; // Assign queues - queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; - queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; + queue_next_x_0 = queues_x_0[1]; + queue_next_x_1 = queues_x_1[1]; - // Load weight - weight = W[kernel_row * KERNEL_SIZE + 0]; - - // Execute row-wise systolic 2d convolution - for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { - // Populate kernel - curr_x = X[row * num_cols + 0]; - queue_push(queue_next_x, curr_x, &resp_x); - curr_x = X[row * num_cols + 1]; - queue_push(queue_next_x, curr_x, &resp_x); - curr_x = X[row * num_cols + 2]; - // Convolution - for (uint32_t col = 3; col < num_cols; ++col) { - queue_push(queue_next_x, curr_x, &resp_x); - curr_y = curr_x * weight; - curr_x = X[row * num_cols + col]; - 
queue_push(queue_next_y, curr_y, &resp_y); + // Load weights + for (uint32_t y = 0; y < 3; ++y) { + for (uint32_t x = 0; x < 3; ++x) { + weights[y][x] = W[y * 3 + x]; } - // Flush kernel - queue_push(queue_next_x, curr_x, &resp_x); - curr_y = curr_x * weight; - queue_push(queue_next_y, curr_y, &resp_y); } -} - -void systolic_conv_front(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { - int32_t *queue_prev_x; - int32_t *queue_next_x; - int32_t *queue_next_y; - int32_t resp_x __attribute__((unused)) = 0; - int32_t resp_y __attribute__((unused)) = 0; - int32_t weight; - int32_t curr_x; - int32_t curr_y; - uint32_t first_row = kernel_id + kernel_row; - uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; - - // Assign queues - queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 0]; - queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; - queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; - - // Load weight - weight = W[kernel_row * KERNEL_SIZE + 0]; // Execute row-wise systolic 2d convolution - for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { - // Populate kernel - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_x, curr_x, &resp_x); - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_x, curr_x, &resp_x); - queue_pop(queue_prev_x, &curr_x); - // Convolution - for (uint32_t col = 3; col < num_cols; ++col) { - queue_push(queue_next_x, curr_x, &resp_x); - curr_y = curr_x * weight; - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_y, curr_y, &resp_y); + row = 2; + while (row < num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 0]; + curr_x[2] = X[(row - 0) * num_cols + 0]; + curr_x[0] = X[(row - 2) * num_cols + 0]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + 
queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + // ---------- + // POPULATE 1 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 1]; + curr_x[2] = X[(row - 0) * num_cols + 1]; + curr_x[0] = X[(row - 2) * num_cols + 1]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 2nd column of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[0] += curr_x[0] * weights[0][0]; + acc_y[0] += curr_x[1] * weights[1][0]; + acc_y[0] += curr_x[2] * weights[2][0]; + // ----------- + // CONVOLUTION + // ----------- + col = 2; + while (col < num_cols_y) { + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; + acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; + acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment 
column index + ++col; } - // Flush kernel - queue_push(queue_next_x, curr_x, &resp_x); - curr_y = curr_x * weight; - queue_push(queue_next_y, curr_y, &resp_y); + // ------- + // FLUSH 0 + // ------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; + // ------- + // FLUSH 1 + // ------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // ------------- + // INCREMENT ROW + // ------------- + row += NUM_CORES; } -} -void systolic_conv_mid(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { - int32_t 
*queue_prev_x; - int32_t *queue_next_x; - int32_t *queue_prev_y; - int32_t *queue_next_y; - int32_t resp_x __attribute__((unused)) = 0; - int32_t resp_y __attribute__((unused)) = 0; - int32_t weight; - int32_t curr_x; - int32_t curr_y; - uint32_t first_row = kernel_id + kernel_row; - uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; - - // Assign queues - queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1]; - queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; - queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; - queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; - - // Load weight - weight = W[kernel_row * KERNEL_SIZE + 1]; - - // Execute row-wise systolic 2d convolution - for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { - // Populate kernel - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_x, curr_x, &resp_x); - queue_pop(queue_prev_x, &curr_x); - // Convolution - for (uint32_t col = 2; col < num_cols; ++col) { - queue_pop(queue_prev_y, &curr_y); - queue_push(queue_next_x, curr_x, &resp_x); - curr_y += curr_x * weight; - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_y, curr_y, &resp_y); + // Finish last row of systolic 2d convolution without pushing + if (row == num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 0]; + curr_x[2] = X[(row - 0) * num_cols + 0]; + curr_x[0] = X[(row - 2) * num_cols + 0]; + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + // ---------- + // POPULATE 1 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 1]; + curr_x[2] = X[(row - 0) * num_cols + 1]; + curr_x[0] = X[(row - 2) * num_cols + 1]; + // MACs with 2nd column of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[2] += 
curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[0] += curr_x[0] * weights[0][0]; + acc_y[0] += curr_x[1] * weights[1][0]; + acc_y[0] += curr_x[2] * weights[2][0]; + // ----------- + // CONVOLUTION + // ----------- + col = 2; + while (col < num_cols_y) { + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; + acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; + acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; } - // Flush kernel - queue_push(queue_next_x, curr_x, &resp_x); + // ------- + // FLUSH 0 + // ------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; 
+ // Increment column index + ++col; + // ------- + // FLUSH 1 + // ------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col]; + curr_x[2] = X[(row - 0) * num_cols + col]; + curr_x[0] = X[(row - 2) * num_cols + col]; + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; } } -void systolic_conv_end(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { - int32_t *queue_prev_x; - int32_t *queue_next_x; - int32_t *queue_prev_y; - int32_t *queue_next_y; - int32_t resp_x __attribute__((unused)) = 0; - int32_t resp_y __attribute__((unused)) = 0; - int32_t weight; - int32_t curr_x; - int32_t curr_y; - uint32_t first_row = kernel_id + kernel_row; - uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; +void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, + const uint32_t num_cols, int32_t const *__restrict__ X, + int32_t const *__restrict__ W, int32_t *__restrict__ Y) { + int32_t *queue_prev_x_0; + int32_t *queue_next_x_0; + int32_t *queue_prev_x_1; + int32_t *queue_next_x_1; + int32_t resp_x_0 __attribute__((unused)) = 0; + int32_t resp_x_1 __attribute__((unused)) = 0; + int32_t weights[3][3]; + int32_t curr_x[3]; + int32_t acc_y[3] = {0, 0, 0}; + uint32_t row; + uint32_t col; + uint32_t num_cols_y = num_cols - 2; // Assign queues - queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; - queue_next_x = queues_x[kernel_row + 1][(kernel_id + 1) * KERNEL_SIZE]; - queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; - queue_next_y = queues_row_acc[kernel_row][kernel_id]; - - // Load weight - weight = W[kernel_row * KERNEL_SIZE + 2]; + 
queue_prev_x_0 = queues_x_0[kernel_id]; + queue_next_x_0 = queues_x_0[kernel_id + 1]; + queue_prev_x_1 = queues_x_1[kernel_id]; + queue_next_x_1 = queues_x_1[kernel_id + 1]; + + // Load weights + for (uint32_t y = 0; y < 3; ++y) { + for (uint32_t x = 0; x < 3; ++x) { + weights[y][x] = W[y * 3 + x]; + } + } // Execute row-wise systolic 2d convolution - for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { - // Populate kernel - queue_pop(queue_prev_x, &curr_x); - // Convolution - for (uint32_t col = 1; col < num_cols - 1; ++col) { - queue_pop(queue_prev_y, &curr_y); - queue_push(queue_next_x, curr_x, &resp_x); - curr_y += curr_x * weight; - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_y, curr_y, &resp_y); + row = kernel_id + 2; + while (row < num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 2nd column of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[0] += curr_x[0] * weights[0][0]; + acc_y[0] += curr_x[1] * weights[1][0]; + acc_y[0] += curr_x[2] * weights[2][0]; + // ----------- + // CONVOLUTION + // ----------- + col = 2; 
+ while (col < num_cols_y) { + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; + acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; + acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; } - // Flush kernel - queue_push(queue_next_x, curr_x, &resp_x); - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_x, curr_x, &resp_x); + // ------- + // FLUSH 0 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // Store finished accumulation + Y[(row - 2) * 
num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; + // ------- + // FLUSH 1 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // ------------- + // INCREMENT ROW + // ------------- + row += NUM_CORES; } - // Flush next queues at the end of execution - queue_pop(queue_next_x, &curr_x); - queue_pop(queue_next_x, &curr_x); + // Finish last row of systolic 2d convolution without pushing + if (row == num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 2nd column of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[0] += curr_x[0] * weights[0][0]; + acc_y[0] += curr_x[1] * weights[1][0]; + acc_y[0] += curr_x[2] * weights[2][0]; + // ----------- + // CONVOLUTION + // 
----------- + col = 2; + while (col < num_cols_y) { + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; + acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; + acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; + } + // ------- + // FLUSH 0 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; + // ------- + // FLUSH 1 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * 
weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + } } -void systolic_conv_last_end(const uint32_t kernel_id, const uint32_t kernel_row, - const uint32_t num_rows, const uint32_t num_cols, - int32_t const *__restrict__ W) { - int32_t *queue_prev_x; - int32_t *queue_prev_y; - int32_t *queue_next_y; - int32_t resp_y __attribute__((unused)) = 0; - int32_t weight; - int32_t curr_x; - int32_t curr_y; - uint32_t first_row = kernel_id + kernel_row; - uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; +void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, + const uint32_t num_cols, int32_t const *__restrict__ X, + int32_t const *__restrict__ W, int32_t *__restrict__ Y) { + int32_t *queue_prev_x_0; + int32_t *queue_prev_x_1; + int32_t weights[3][3]; + int32_t curr_x[3]; + int32_t acc_y[3] = {0, 0, 0}; + uint32_t col; + uint32_t num_cols_y = num_cols - 2; // Assign queues - queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2]; - queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2]; - queue_next_y = queues_row_acc[kernel_row][kernel_id]; + queue_prev_x_0 = queues_x_0[kernel_id]; + queue_prev_x_1 = queues_x_1[kernel_id]; - // Load weight - weight = W[kernel_row * KERNEL_SIZE + 2]; - - // Execute row-wise systolic 2d convolution - for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) { - // Populate kernel - queue_pop(queue_prev_x, &curr_x); - // Convolution - for (uint32_t col = 1; col < num_cols - 1; ++col) { - queue_pop(queue_prev_y, &curr_y); - curr_y += curr_x * weight; - queue_pop(queue_prev_x, &curr_x); - queue_push(queue_next_y, curr_y, &resp_y); + // Load weights + for (uint32_t y = 0; y < 3; ++y) { + for (uint32_t x = 0; x < 3; ++x) { + weights[y][x] = W[y * 3 + x]; } - // Flush kernel - 
queue_pop(queue_prev_x, &curr_x); } -} - -void systolic_conv_row_acc(const uint32_t kernel_id, const uint32_t num_rows_y, - const uint32_t num_cols_y, int32_t *__restrict__ Y) { - int32_t *queue_y_0; - int32_t *queue_y_1; - int32_t *queue_y_2; - int32_t curr_y_0; - int32_t curr_y_1; - int32_t curr_y_2; - int32_t total_y; - - // Assign queues - queue_y_0 = queues_row_acc[0][kernel_id]; - queue_y_1 = queues_row_acc[1][kernel_id]; - queue_y_2 = queues_row_acc[2][kernel_id]; // Execute row-wise systolic 2d convolution - for (uint32_t row = kernel_id; row < num_rows_y; row += NUM_KERNELS) { - // Accumulate and Store - for (uint32_t col = 0; col < num_cols_y; ++col) { - queue_pop(queue_y_0, &curr_y_0); - queue_pop(queue_y_1, &curr_y_1); - queue_pop(queue_y_2, &curr_y_2); - total_y = curr_y_0 + curr_y_1 + curr_y_2; - Y[row * num_cols_y + col] = total_y; + for (uint32_t row = kernel_id + 2; row < num_rows; row += NUM_CORES) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 2nd column of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[0] += curr_x[0] * weights[0][0]; + acc_y[0] += curr_x[1] * weights[1][0]; + acc_y[0] += curr_x[2] * weights[2][0]; + // ----------- + // CONVOLUTION + // ----------- + col = 2; + while (col < num_cols_y) { + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + 
queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // MACs with 1st column of weights + acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; + acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; + acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; } + // ------- + // FLUSH 0 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // MACs with 2nd column of weights + acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; + // Increment column index + ++col; + // ------- + // FLUSH 1 + // ------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 3th column of weights + acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; + // Store finished accumulation + Y[(row 
- 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + // Reset finished accumulation + acc_y[col % 3] = 0; } } From 0fbe88bd7e3b95b9c7a7796d6154dba9621d0a2f Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Fri, 25 Jun 2021 02:45:16 +0200 Subject: [PATCH 17/24] [apps] Improve conv_xqueue code - fix illegal multi queue pop - increase performance via fixed cyclical pattern - add shuffling of MACs to hide accelerator latency --- software/runtime/systolic/conv_xqueue.h | 903 ++++++++++++++++++++---- 1 file changed, 747 insertions(+), 156 deletions(-) diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h index 7224acb90..096ec4427 100644 --- a/software/runtime/systolic/conv_xqueue.h +++ b/software/runtime/systolic/conv_xqueue.h @@ -120,6 +120,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ---------- // POPULATE 1 // ---------- @@ -130,42 +131,165 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 2nd column of weights + // MACs with 1st row of weights acc_y[2] += curr_x[0] * weights[0][1]; - acc_y[2] += curr_x[1] * weights[1][1]; - acc_y[2] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - // ----------- - // CONVOLUTION - // ----------- + // ------------------ + // CONVOLUTION BURSTS + // ------------------ col = 2; + while (col < num_cols_y - 2) { + // ----------- + // ITERATION 0 + //
----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 2 
+ // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 2]; + curr_x[2] = X[(row - 0) * num_cols + col + 2]; + curr_x[0] = X[(row - 2) * num_cols + col + 2]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // 
MACs with 1st column of weights - acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; - acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; - acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Reset finished accumulation - acc_y[col % 3] = 0; + acc_y[2] = 0; + // Increment column index + ++col; + if (col >= num_cols_y) break; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; // Increment column index ++col; } @@ -179,18 +303,17 @@ 
void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights + // MACs with 1st row of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + // MACs with 2nd row of weights + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + // MACs with 3rd row of weights + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; // Increment column index ++col; // ------- @@ -203,14 +326,18 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights + // MACs with 3rd column of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; + // ------------------ + // RESET ACCUMULATORS + // ------------------ + acc_y[0] = 0; + acc_y[1] = 0; + acc_y[2] = 0; // ------------- // INCREMENT ROW // ------------- @@ -237,39 +364,156 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, curr_x[1] = X[(row - 1) * num_cols + 1]; curr_x[2] = X[(row - 0) * num_cols + 1]; curr_x[0] = X[(row - 2) * num_cols + 1]; - // MACs with 2nd column 
of weights + // MACs with 1st row of weights acc_y[2] += curr_x[0] * weights[0][1]; - acc_y[2] += curr_x[1] * weights[1][1]; - acc_y[2] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - // ----------- - // CONVOLUTION - // ----------- + // ------------------ + // CONVOLUTION BURSTS + // ------------------ col = 2; + while (col < num_cols_y - 2) { + // ----------- + // ITERATION 0 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + 
acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 2 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 2]; + curr_x[2] = X[(row - 0) * num_cols + col + 2]; + curr_x[0] = X[(row - 2) * num_cols + col + 2]; + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] 
* weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + // Increment column index + ++col; + if (col >= num_cols_y) break; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; - // MACs with 3th column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights - acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; - acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; - acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * 
weights[2][0]; // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; // Reset finished accumulation - acc_y[col % 3] = 0; + acc_y[0] = 0; // Increment column index ++col; } @@ -280,18 +524,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, curr_x[1] = X[(row - 1) * num_cols + col]; curr_x[2] = X[(row - 0) * num_cols + col]; curr_x[0] = X[(row - 2) * num_cols + col]; - // MACs with 3th column of weights + // MACs with 1st row of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + // MACs with 2nd row of weights + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + // MACs with 3rd row of weights + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; // Increment column index ++col; // ------- @@ -301,14 +544,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, curr_x[1] = X[(row - 1) * num_cols + col]; curr_x[2] = X[(row - 0) * num_cols + col]; curr_x[0] = X[(row - 2) * num_cols + col]; - // MACs with 3th column of weights + // MACs with 3rd column of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; } } @@ -358,6 +599,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; 
acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ---------- // POPULATE 1 // ---------- @@ -368,19 +610,142 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 2nd column of weights + // MACs with 1st row of weights acc_y[2] += curr_x[0] * weights[0][1]; - acc_y[2] += curr_x[1] * weights[1][1]; - acc_y[2] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - // ----------- - // CONVOLUTION - // ----------- + // ------------------ + // CONVOLUTION BURSTS + // ------------------ col = 2; + while (col < num_cols_y - 2) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + 
__asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + // ---------------- + 
// INCREMENT COLUMN + // ---------------- + col += 3; + } + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + // Increment column index + ++col; + if (col >= num_cols_y) break; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- // Pop and load x vector queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; @@ -388,22 +753,22 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // MACs 
with 1st column of weights - acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; - acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; - acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; // Reset finished accumulation - acc_y[col % 3] = 0; + acc_y[0] = 0; // Increment column index ++col; } @@ -417,18 +782,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights + // MACs with 1st row of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + // MACs with 2nd row of weights + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + // MACs with 3rd row of weights + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; // Increment column index ++col; // ------- @@ -441,14 +805,18 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t 
num_rows, // Push lower part of x vector queue_push(queue_next_x_0, curr_x[1], &resp_x_0); queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3th column of weights + // MACs with 3rd column of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; + // ------------------ + // RESET ACCUMULATORS + // ------------------ + acc_y[0] = 0; + acc_y[1] = 0; + acc_y[2] = 0; // ------------- // INCREMENT ROW // ------------- @@ -475,39 +843,150 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + 1]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 2nd column of weights + // MACs with 1st row of weights acc_y[2] += curr_x[0] * weights[0][1]; - acc_y[2] += curr_x[1] * weights[1][1]; - acc_y[2] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - // ----------- - // CONVOLUTION - // ----------- + // ------------------ + // CONVOLUTION BURSTS + // ------------------ col = 2; + while (col < num_cols_y - 2) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] 
+= curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 
2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + // Increment column index + ++col; + if (col >= num_cols_y) break; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- // Pop and load x vector queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights - acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; - acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; - acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // MACs with 1st row of weights + acc_y[0] += 
curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; // Reset finished accumulation - acc_y[col % 3] = 0; + acc_y[0] = 0; // Increment column index ++col; } @@ -518,18 +997,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights + // MACs with 1st row of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + // MACs with 2nd row of weights + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + // MACs with 3rd row of weights + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; // Increment column index ++col; // ------- @@ -539,14 +1017,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights + // MACs with 3rd column of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; 
acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; } } @@ -585,6 +1061,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ---------- // POPULATE 1 // ---------- @@ -592,39 +1069,150 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + 1]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 2nd column of weights + // MACs with 1st row of weights acc_y[2] += curr_x[0] * weights[0][1]; - acc_y[2] += curr_x[1] * weights[1][1]; - acc_y[2] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - // ----------- - // CONVOLUTION - // ----------- + // ------------------ + // CONVOLUTION BURSTS + // ------------------ col = 2; + while (col < num_cols_y - 2) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * 
weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + // ---------------- + // INCREMENT COLUMN + // 
---------------- + col += 3; + } + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + // Increment column index + ++col; + if (col >= num_cols_y) break; + __asm__ __volatile__("":::"memory"); + // ----------- + // ITERATION 1 + // ----------- // Pop and load x vector queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // MACs with 1st column of weights - acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0]; - acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0]; - acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0]; + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs 
with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; // Reset finished accumulation - acc_y[col % 3] = 0; + acc_y[0] = 0; // Increment column index ++col; } @@ -635,18 +1223,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights + // MACs with 1st row of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // MACs with 2nd column of weights acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; + // MACs with 2nd row of weights + acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; + // MACs with 3rd row of weights + acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; // Increment column index ++col; // ------- @@ -656,13 +1243,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, queue_pop(queue_prev_x_1, &curr_x[1]); curr_x[2] = X[row * num_cols + col]; queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3th column of weights + // MACs with 3rd column of weights acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; // Store finished 
accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Reset finished accumulation - acc_y[col % 3] = 0; + // ------------------ + // RESET ACCUMULATORS + // ------------------ + acc_y[0] = 0; + acc_y[1] = 0; + acc_y[2] = 0; } } From 244b9a5794c273ea5ad4410e998b382587fb2ba4 Mon Sep 17 00:00:00 2001 From: Gua Hao Khov Date: Fri, 16 Jul 2021 02:35:02 +0200 Subject: [PATCH 18/24] [apps] Improve regularity of conv_xqueue --- hardware/src/tcdm_adapter_xqueue.sv | 6 +- software/apps/systolic/conv_xqueue/main.c | 4 +- software/runtime/systolic/conv_xqueue.h | 264 +++------------------- 3 files changed, 42 insertions(+), 232 deletions(-) diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index 4adb3f415..407952141 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -10,9 +10,9 @@ `include "common_cells/registers.svh" -import cf_math_pkg::idx_width; - -module tcdm_adapter_xqueue #( +module tcdm_adapter_xqueue + import cf_math_pkg::idx_width; +#( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, parameter int unsigned XQueueSize = 4, diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c index 6fd8045b5..c95b674d5 100644 --- a/software/apps/systolic/conv_xqueue/main.c +++ b/software/apps/systolic/conv_xqueue/main.c @@ -27,8 +27,8 @@ #include "synchronization.h" // Dimensions of matrix X -#define DIM_X_M 32 -#define DIM_X_N 32 +#define DIM_X_M 258 +#define DIM_X_N 61 // Dimensions of matrix Y #define DIM_Y_M (DIM_X_M - 2) diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h index 096ec4427..bff238d0e 100644 --- a/software/runtime/systolic/conv_xqueue.h +++ b/software/runtime/systolic/conv_xqueue.h @@ -39,12 +39,12 @@ int32_t *queues_x_1[NUM_CORES]; // queue push static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) { - asm 
volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue)); + asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue) : "memory"); } // queue pop inline void queue_pop(void *const queue, int32_t *const ret) { - asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); + asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue) : "memory"); } void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) { @@ -87,7 +87,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, int32_t resp_x_1 __attribute__((unused)) = 0; int32_t weights[3][3]; int32_t curr_x[3]; - int32_t acc_y[3] = {0, 0, 0}; + register int32_t acc_y[3] = {0, 0, 0}; uint32_t row; uint32_t col; uint32_t num_cols_y = num_cols - 2; @@ -140,11 +140,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ col = 2; - while (col < num_cols_y - 2) { + while (col < num_cols_y) { // ----------- // ITERATION 0 // ----------- @@ -225,15 +226,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; + __asm__ __volatile__("":::"memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } + __asm__ __volatile__("":::"memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- - while (col < num_cols_y) { + while (col < num_cols) { // ----------- // ITERATION 0 // ----------- @@ -258,11 +261,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; - // Reset finished accumulation - acc_y[2] = 0; // Increment 
column index ++col; - if (col >= num_cols_y) break; + if (col >= num_cols) break; __asm__ __volatile__("":::"memory"); // ----------- // ITERATION 1 @@ -288,50 +289,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; - // Reset finished accumulation - acc_y[0] = 0; - // Increment column index - ++col; } - // ------- - // FLUSH 0 - // ------- - // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; - // Push lower part of x vector - queue_push(queue_next_x_0, curr_x[1], &resp_x_0); - queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 1st row of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - // MACs with 2nd row of weights - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - // MACs with 3rd row of weights - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Increment column index - ++col; - // ------- - // FLUSH 1 - // ------- - // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; - // Push lower part of x vector - queue_push(queue_next_x_0, curr_x[1], &resp_x_0); - queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3rd column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; // ------------------ // RESET ACCUMULATORS // 
------------------ @@ -357,6 +315,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ---------- // POPULATE 1 // ---------- @@ -373,11 +332,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ col = 2; - while (col < num_cols_y - 2) { + while (col < num_cols_y) { // ----------- // ITERATION 0 // ----------- @@ -449,15 +409,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; + __asm__ __volatile__("":::"memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } + __asm__ __volatile__("":::"memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- - while (col < num_cols_y) { + while (col < num_cols) { // ----------- // ITERATION 0 // ----------- @@ -482,11 +444,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; - // Reset finished accumulation - acc_y[2] = 0; // Increment column index ++col; - if (col >= num_cols_y) break; + if (col >= num_cols) break; __asm__ __volatile__("":::"memory"); // ----------- // ITERATION 1 @@ -512,44 +472,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; - // Reset finished accumulation - acc_y[0] = 0; - // Increment column index - ++col; } - // 
------- - // FLUSH 0 - // ------- - // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; - // MACs with 1st row of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - // MACs with 2nd row of weights - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - // MACs with 3rd row of weights - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Increment column index - ++col; - // ------- - // FLUSH 1 - // ------- - // Load x vector - curr_x[1] = X[(row - 1) * num_cols + col]; - curr_x[2] = X[(row - 0) * num_cols + col]; - curr_x[0] = X[(row - 2) * num_cols + col]; - // MACs with 3rd column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; } } @@ -564,7 +487,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, int32_t resp_x_1 __attribute__((unused)) = 0; int32_t weights[3][3]; int32_t curr_x[3]; - int32_t acc_y[3] = {0, 0, 0}; + register int32_t acc_y[3] = {0, 0, 0}; uint32_t row; uint32_t col; uint32_t num_cols_y = num_cols - 2; @@ -619,11 +542,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ col = 2; - while (col < num_cols_y - 2) { + while (col < num_cols_y) { // ----------- // ITERATION 0 // ----------- @@ -704,15 +628,17 @@ void 
systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; + __asm__ __volatile__("":::"memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } + __asm__ __volatile__("":::"memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- - while (col < num_cols_y) { + while (col < num_cols) { // ----------- // ITERATION 0 // ----------- @@ -737,11 +663,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; - // Reset finished accumulation - acc_y[2] = 0; // Increment column index ++col; - if (col >= num_cols_y) break; + if (col >= num_cols) break; __asm__ __volatile__("":::"memory"); // ----------- // ITERATION 1 @@ -767,50 +691,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; - // Reset finished accumulation - acc_y[0] = 0; - // Increment column index - ++col; } - // ------- - // FLUSH 0 - // ------- - // Pop and load x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // Push lower part of x vector - queue_push(queue_next_x_0, curr_x[1], &resp_x_0); - queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 1st row of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - // MACs with 2nd row of weights - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - // MACs with 3rd row of weights - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // Store finished accumulation 
- Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Increment column index - ++col; - // ------- - // FLUSH 1 - // ------- - // Pop and load x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // Push lower part of x vector - queue_push(queue_next_x_0, curr_x[1], &resp_x_0); - queue_push(queue_next_x_1, curr_x[2], &resp_x_1); - // MACs with 3rd column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; // ------------------ // RESET ACCUMULATORS // ------------------ @@ -836,6 +717,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ---------- // POPULATE 1 // ---------- @@ -852,11 +734,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ col = 2; - while (col < num_cols_y - 2) { + while (col < num_cols_y) { // ----------- // ITERATION 0 // ----------- @@ -928,15 +811,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; + __asm__ __volatile__("":::"memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } + __asm__ __volatile__("":::"memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- - while (col < num_cols_y) { + while (col < num_cols) { // ----------- // ITERATION 0 // 
----------- @@ -958,11 +843,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; - // Reset finished accumulation - acc_y[2] = 0; // Increment column index ++col; - if (col >= num_cols_y) break; + if (col >= num_cols) break; __asm__ __volatile__("":::"memory"); // ----------- // ITERATION 1 @@ -985,44 +868,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; - // Reset finished accumulation - acc_y[0] = 0; - // Increment column index - ++col; } - // ------- - // FLUSH 0 - // ------- - // Pop and load x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 1st row of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - // MACs with 2nd row of weights - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - // MACs with 3rd row of weights - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Increment column index - ++col; - // ------- - // FLUSH 1 - // ------- - // Pop and load x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3rd column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; } } @@ -1033,7 +879,7 @@ void 
systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, int32_t *queue_prev_x_1; int32_t weights[3][3]; int32_t curr_x[3]; - int32_t acc_y[3] = {0, 0, 0}; + register int32_t acc_y[3] = {0, 0, 0}; uint32_t col; uint32_t num_cols_y = num_cols - 2; @@ -1078,11 +924,12 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("":::"memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ col = 2; - while (col < num_cols_y - 2) { + while (col < num_cols_y) { // ----------- // ITERATION 0 // ----------- @@ -1154,15 +1001,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; + __asm__ __volatile__("":::"memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } + __asm__ __volatile__("":::"memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- - while (col < num_cols_y) { + while (col < num_cols) { // ----------- // ITERATION 0 // ----------- @@ -1184,11 +1033,9 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, acc_y[1] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; - // Reset finished accumulation - acc_y[2] = 0; // Increment column index ++col; - if (col >= num_cols_y) break; + if (col >= num_cols) break; __asm__ __volatile__("":::"memory"); // ----------- // ITERATION 1 @@ -1211,44 +1058,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[2] * weights[2][0]; // Store finished accumulation Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; - // Reset finished accumulation - acc_y[0] = 0; - // Increment column index - ++col; } - // ------- - // FLUSH 0 - // ------- - // Pop and load 
x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 1st row of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1]; - // MACs with 2nd row of weights - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1]; - // MACs with 3rd row of weights - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; - // Increment column index - ++col; - // ------- - // FLUSH 1 - // ------- - // Pop and load x vector - queue_pop(queue_prev_x_1, &curr_x[1]); - curr_x[2] = X[row * num_cols + col]; - queue_pop(queue_prev_x_0, &curr_x[0]); - // MACs with 3rd column of weights - acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2]; - acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2]; - acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2]; - // Store finished accumulation - Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3]; // ------------------ // RESET ACCUMULATORS // ------------------ From 4517da9fe0094cb396c1a3a83b83fd03f17eca95 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 14 Sep 2022 12:45:20 +0200 Subject: [PATCH 19/24] [apps] Fix license and format --- software/apps/systolic/conv_xqueue/main.c | 18 +--- software/apps/systolic/matmul_xqueue/main.c | 18 +--- software/apps/systolic/xqueue_test/main.c | 20 +--- software/runtime/systolic/conv_xqueue.h | 106 ++++++++++---------- software/runtime/systolic/matmul_xqueue.h | 16 +-- 5 files changed, 63 insertions(+), 115 deletions(-) diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c index c95b674d5..f4c4339b8 100644 --- a/software/apps/systolic/conv_xqueue/main.c +++ b/software/apps/systolic/conv_xqueue/main.c @@ -1,18 +1,6 @@ -// Copyright 2021 
ETH Zurich and University of Bologna. -// +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. // Author: Gua Hao Khov, ETH Zurich @@ -21,10 +9,10 @@ #include "alloc.h" #include "encoding.h" -#include "systolic/conv_xqueue.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "systolic/conv_xqueue.h" // Dimensions of matrix X #define DIM_X_M 258 diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index f7a648ab3..dada500b4 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -1,18 +1,6 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. // Author: Gua Hao Khov, ETH Zurich @@ -21,10 +9,10 @@ #include "alloc.h" #include "encoding.h" -#include "systolic/matmul_xqueue.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "systolic/matmul_xqueue.h" // Dimensions of matrices #define DIM_M 24 diff --git a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c index 4cd39ca5c..ee4b7ee92 100644 --- a/software/apps/systolic/xqueue_test/main.c +++ b/software/apps/systolic/xqueue_test/main.c @@ -1,18 +1,6 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
// Author: Gua Hao Khov, ETH Zurich @@ -32,14 +20,14 @@ int32_t producer_check, consumer_check, dummy_check; // queue push static inline int32_t queue_push(void *const queue, int32_t data) { int32_t ret; - asm volatile ("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue)); + asm volatile("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue)); return ret; } // queue pop inline int32_t queue_pop(void *const queue) { int32_t ret; - asm volatile ("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue)); + asm volatile("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue)); return ret; } diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h index bff238d0e..8e6e251de 100644 --- a/software/runtime/systolic/conv_xqueue.h +++ b/software/runtime/systolic/conv_xqueue.h @@ -1,18 +1,6 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
// Author: Gua Hao Khov, ETH Zurich @@ -39,7 +27,10 @@ int32_t *queues_x_1[NUM_CORES]; // queue push static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) { - asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue) : "memory"); + asm volatile("q.push.w %0, %1, (%2)" + : "+r"(*ret) + : "r"(data), "r"(queue) + : "memory"); } // queue pop @@ -120,7 +111,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------- // POPULATE 1 // ---------- @@ -140,7 +131,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ @@ -172,7 +163,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; // Reset finished accumulation acc_y[2] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -199,7 +190,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; // Reset finished accumulation acc_y[0] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 2 // ----------- @@ -226,13 +217,13 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // 
---------------- // INCREMENT COLUMN // ---------------- col += 3; } - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- @@ -263,8 +254,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Increment column index ++col; - if (col >= num_cols) break; - __asm__ __volatile__("":::"memory"); + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -315,7 +307,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------- // POPULATE 1 // ---------- @@ -332,7 +324,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ @@ -361,7 +353,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; // Reset finished accumulation acc_y[2] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -385,7 +377,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; // Reset finished accumulation acc_y[0] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 2 // ----------- @@ -409,13 +401,13 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t 
num_cols, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- @@ -446,8 +438,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Increment column index ++col; - if (col >= num_cols) break; - __asm__ __volatile__("":::"memory"); + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -522,7 +515,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------- // POPULATE 1 // ---------- @@ -542,7 +535,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ @@ -574,7 +567,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; // Reset finished accumulation acc_y[2] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -601,7 +594,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; // Reset finished accumulation acc_y[0] = 0; - __asm__ 
__volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 2 // ----------- @@ -628,13 +621,13 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- @@ -665,8 +658,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Increment column index ++col; - if (col >= num_cols) break; - __asm__ __volatile__("":::"memory"); + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -717,7 +711,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------- // POPULATE 1 // ---------- @@ -734,7 +728,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ @@ -763,7 +757,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; // Reset finished accumulation acc_y[2] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -787,7 +781,7 
@@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; // Reset finished accumulation acc_y[0] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 2 // ----------- @@ -811,13 +805,13 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- @@ -845,8 +839,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Increment column index ++col; - if (col >= num_cols) break; - __asm__ __volatile__("":::"memory"); + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -907,7 +902,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, acc_y[2] += curr_x[0] * weights[0][0]; acc_y[2] += curr_x[1] * weights[1][0]; acc_y[2] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------- // POPULATE 1 // ---------- @@ -924,7 +919,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, // MACs with 3rd row of weights acc_y[2] += curr_x[2] * weights[2][1]; acc_y[0] += curr_x[2] * weights[2][0]; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ------------------ // CONVOLUTION BURSTS // ------------------ @@ -953,7 +948,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; 
// Reset finished accumulation acc_y[2] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- @@ -977,7 +972,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; // Reset finished accumulation acc_y[0] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 2 // ----------- @@ -1001,13 +996,13 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; // Reset finished accumulation acc_y[1] = 0; - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // ---------------- // INCREMENT COLUMN // ---------------- col += 3; } - __asm__ __volatile__("":::"memory"); + __asm__ __volatile__("" ::: "memory"); // --------------------- // CONVOLUTION REMAINDER // --------------------- @@ -1035,8 +1030,9 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; // Increment column index ++col; - if (col >= num_cols) break; - __asm__ __volatile__("":::"memory"); + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); // ----------- // ITERATION 1 // ----------- diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index cb26e762b..dbfe51b8b 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -1,18 +1,6 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. // Author: Gua Hao Khov, ETH Zurich From 4691691e04cfff047f87589b75f564cc4c85da7d Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Thu, 15 Sep 2022 15:02:31 +0200 Subject: [PATCH 20/24] [CHANGELOG] Add Xqueue extension and sw kernels --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 187934617..e934e6ca3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added - Add a DMA +- Add support to hardrware-accelerated queues for CGRA (RV32A extension) +- Add systolic implementation of matmul and 2d convolution exploiting hardware-accelerated queues ### Fixed - Measure the `wfi` stalls and stalls caused by `opc` properly @@ -34,7 +36,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Add the `terapool` configuration - Add read-only caches to the hierarchical AXI interconnect - Add a `memcpy` benchmark -- Add a systolic configuration including runtime support and a matmul application +- Add a systolic configuration for software-emulated CGRA including runtime support and a systolic matmul - Add `axpy` kernel - Add Spyglass linting scripts - Add an OpenMP runtime and example applications From 8e210c7b6c621ea01ae174493e57cb26fbbd64f2 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 16 Sep 2022 11:02:56 +0200 Subject: [PATCH 21/24] [config] Update systolic config --- config/README.md | 1 + config/systolic.mk | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/config/README.md b/config/README.md index 1aa187773..60641c979 100644 --- a/config/README.md +++ b/config/README.md @@ -10,6 +10,7 @@ flavors of MemPool. We currently support three flavors: - `terapool`: 1024 cores, organized into 128 tiles with eight cores each - `mempool`: 256 cores, organized into 64 tiles with four cores each (default) - `minpool`: 16 cores, organized into 4 tiles with four cores each +- `systolic`: same as `mempool` but the cores form a CGRA Use the `config` variable to define which configuration to take. 
For example, to run a simulation with the `minpool` configuration, you would run diff --git a/config/systolic.mk b/config/systolic.mk index e14ce5a99..9d22978d3 100644 --- a/config/systolic.mk +++ b/config/systolic.mk @@ -19,7 +19,7 @@ num_cores_per_tile ?= 4 banking_factor ?= 4 # Radix for hierarchical AXI interconnect -axi_hier_radix ?= 16 +axi_hier_radix ?= 20 # Number of AXI masters per group axi_masters_per_group ?= 1 From 26f38776660f806981b78a6701b225350c542792 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 23 Sep 2022 17:59:15 +0200 Subject: [PATCH 22/24] [hardware] :bug: Add write response to Xqueue TCDM adapter --- hardware/src/tcdm_adapter_xqueue.sv | 37 +++++++++++++---------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv index 407952141..196ed2222 100644 --- a/hardware/src/tcdm_adapter_xqueue.sv +++ b/hardware/src/tcdm_adapter_xqueue.sv @@ -84,15 +84,15 @@ module tcdm_adapter_xqueue logic sresp_vld; // Helper signals to determine response data acquisition - logic mem_read_req; - logic force_rdata_acq; - logic prevent_rdata_acq; + logic mem_req; + logic prevent_resp_acq; // FSM related signals state_e state_q, state_d; logic vld_amo_op; logic req_accepted, resp_accepted; logic queue_stalled_d, queue_stalled_q; + logic amo_wb; // Temporary storage for AMO operations amo_op_t amo_op_d, amo_op_q; @@ -131,7 +131,7 @@ module tcdm_adapter_xqueue .ready_i(meta_out_rdy ), .data_o (stored_meta_data) ); - assign meta_in_vld = req_accepted & !in_write_i & !stalled_queue_op; + assign meta_in_vld = req_accepted & !stalled_queue_op; assign meta_out_rdy = sresp_select_q ? 
1'b0 : resp_accepted; // Stores the metadata at handshake of stalled queue operations @@ -169,10 +169,10 @@ module tcdm_adapter_xqueue assign resp_in_data = out_rdata_i; assign rdata_out_rdy = resp_accepted; - // Set if memory read request occurs this cycle - assign mem_read_req = out_req_o & !out_write_o; - // Acquire response data a cycle after a memory read request (can be forced or prevented) - assign rdata_in_vld_d = force_rdata_acq | (mem_read_req & !prevent_rdata_acq); + // Set if memory read/write request occurs this cycle + assign mem_req = out_req_o && !amo_wb; + // Acquire response data a cycle after a memory read/write request (can be forced or prevented) + assign rdata_in_vld_d = mem_req & !prevent_resp_acq; // Output response valid if both meta and read data are available (the read data will always be last) assign resp_vld = meta_out_vld & rdata_out_vld; @@ -194,6 +194,7 @@ module tcdm_adapter_xqueue amo_op_d = AMONone; addr_d = addr_q; amo_operand_b_d = amo_operand_b_q; + amo_wb = 1'b0; state_d = state_q; sresp_select_d = sresp_select_q; queue_stalled_d = queue_stalled_q; @@ -211,9 +212,8 @@ module tcdm_adapter_xqueue // Response data as feed-through of read data // resp_in_data = out_rdata_i; - // Flags to force or prevent response acquisition - force_rdata_acq = 1'b0; - prevent_rdata_acq = 1'b0; + // Flag to prevent read/write response acquisition in case it does not actually happen + prevent_resp_acq = 1'b0; // Flags to increment queue counters increment_tail = 1'b0; @@ -257,14 +257,11 @@ module tcdm_adapter_xqueue // Note: Memory write is still executed but the tail is not incremented // Set stalled flag queue_stalled_d = 1'b1; - // Prevent acquisition of response data - prevent_rdata_acq = 1'b1; + // Prevent acquisition of read/write response data + prevent_resp_acq = 1'b1; end else begin // Set increment flag increment_tail = 1'b1; - // Force acquisition of response data despite a write access - // Response data will match the write data of the 
write access - force_rdata_acq = 1'b1; // Previous queue pop failed due to empty queue if (queue_stalled_q) begin queue_stalled_d = 1'b0; @@ -278,8 +275,8 @@ module tcdm_adapter_xqueue if (queue_empty) begin // Set stalled flag queue_stalled_d = 1'b1; - // Prevent acquisition of response data despite read access - prevent_rdata_acq = 1'b1; + // Prevent acquisition of read/write response data + prevent_resp_acq = 1'b1; end else begin // Set increment flag increment_head = 1'b1; @@ -303,6 +300,7 @@ module tcdm_adapter_xqueue out_write_o = 1'b1; out_add_o = addr_q; out_be_o = 4'b1111; + amo_wb = 1'b1; // serve from register if we cut the path if (RegisterAmo) begin out_wdata_o = amo_result_q; @@ -327,9 +325,6 @@ module tcdm_adapter_xqueue increment_tail = 1'b1; // Trigger memory access out_req_o = 1'b1; - // Force acquisition of response data despite a write access - // Response data will match the write data of the write access - force_rdata_acq = 1'b1; // Set meta data selection to stalled meta data sresp_select_d = 1'b1; // Return to Idle From 2d8fa4edd61cacf0d1a030afa803c912bc0b1b63 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 21 Nov 2022 15:20:22 +0100 Subject: [PATCH 23/24] [software] Generalize systolic matmul NUM_CORES --- software/apps/systolic/matmul_xqueue/main.c | 14 +++++++++----- software/runtime/systolic/matmul_xqueue.h | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c index dada500b4..5c69fde7e 100644 --- a/software/apps/systolic/matmul_xqueue/main.c +++ b/software/apps/systolic/matmul_xqueue/main.c @@ -70,6 +70,7 @@ int main() { core_mapping = (uint32_t *)simple_malloc(num_cores * 4); } +#if NUM_CORES == 16 // ---------- // 16 CORES // ---------- @@ -89,7 +90,7 @@ int main() { // uint32_t row_idx = tile_id / 2; // row_idx *= 2; // row_idx += (core_id % 4) / 2; - +#elif NUM_CORES == 256 // ---------- // 256 CORES // ---------- @@ 
-99,8 +100,8 @@ int main() { // uint32_t row_idx = core_id / 16; // Assign grid position (col wise) - // uint32_t col_idx = core_id / 16; - // uint32_t row_idx = core_id % 16; + uint32_t col_idx = core_id / 16; + uint32_t row_idx = core_id % 16; // Assign grid position (square wise) // uint32_t col_idx = tile_id % 8; @@ -122,6 +123,9 @@ int main() { // row_idx *= 2; // row_idx += (core_id % 4) / 2; // row_idx += add_row * 8; +#else +#error Unsupported NUM_CORES +#endif // Wait for all cores mempool_barrier(num_cores); @@ -138,10 +142,10 @@ int main() { printf("> Initialize\n"); // Print out tile mapping - // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); + //print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); // Print out core mapping - // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); + //print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE); // Initialize systolic array systolic_init(tile_mapping, core_mapping); diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h index dbfe51b8b..c1f8aac3b 100644 --- a/software/runtime/systolic/matmul_xqueue.h +++ b/software/runtime/systolic/matmul_xqueue.h @@ -28,7 +28,7 @@ #include "printf.h" // Dimensions of square systolic array -#define SYSTOLIC_SIZE 4 +#define SYSTOLIC_SIZE 16 // Systolic matrix typedef struct { From 0c2dffe5229cb7ca0ff904a0baa0cd0ecbce068d Mon Sep 17 00:00:00 2001 From: "msc22h14 Vaibhav Krishna (vakrishna)" Date: Fri, 2 Dec 2022 16:09:57 +0100 Subject: [PATCH 24/24] [tb] Add support to trace retired TCDM operations --- CHANGELOG.md | 1 + CONTRIBUTORS.md | 1 + hardware/Makefile | 7 +- hardware/tb/mempool_tb.sv | 177 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e934e6ca3..f2ef14457 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic 
Versioning](http://semver.org/spec/v2.0.0. - Add a DMA - Add support to hardrware-accelerated queues for CGRA (RV32A extension) - Add systolic implementation of matmul and 2d convolution exploiting hardware-accelerated queues +- Add ability to Trace the operations retired by the TCDM adapters ### Fixed - Measure the `wfi` stalls and stalls caused by `opc` properly diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a64295735..c7145f41e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -11,4 +11,5 @@ Thanks to all. * Marc Gantenbein * Marco Bertuletti * Sergio Mazzola +* Vaibhav Krishna * Yichao Zhang diff --git a/hardware/Makefile b/hardware/Makefile index 046d9ed04..5c1cd7ab9 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -47,6 +47,7 @@ verilator_top ?= mempool_tb_verilator python ?= python3 # Enable tracing snitch_trace ?= 0 +bank_trace ?= 0 # Check if the specified QuestaSim version exists ifeq (, $(shell which $(questa_cmd))) @@ -90,13 +91,17 @@ vlog_args += -work $(library) vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor) vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks) vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg) -vlog_defs += -DSNITCH_TRACE=$(snitch_trace) +vlog_defs += -DSNITCH_TRACE=$(snitch_trace) -DBANK_TRACE=$(bank_trace) vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width) vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width) vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group) vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group) vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size) +ifeq ($(xqueue),1) + vlog_defs+= -DXQUEUE_TCDM_ADAPTER +endif + # Traffic generation enabled ifdef tg tg_ncycles ?= 10000 diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv index c8dd12e9b..fedf5c651 100644 --- a/hardware/tb/mempool_tb.sv +++ 
b/hardware/tb/mempool_tb.sv @@ -194,6 +194,183 @@ module mempool_tb; end: gen_wfi_tiles end: gen_wfi_groups +`endif +`endif + + /************************ + * Mempool Bank Trace * + ************************/ +//Accessing signals hierarchically is not supported by Verilator +`ifndef TARGET_SYNTHESIS +`ifndef TARGET_VERILATOR + //Hierarchy for TCDM adapter + `ifdef XQUEUE_TCDM_ADAPTER + `define TCDM_ADAPTER(group,tile,bank) \ + dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter_xqueue.i_tcdm_adapter + `else + `define TCDM_ADAPTER(group,tile,bank) \ + dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter.i_tcdm_adapter + `endif + int f; + + initial begin + f = $fopen("trace_bank.dasm", "w"); + end + + localparam int BankTrace = `ifdef BANK_TRACE `BANK_TRACE `else 0 `endif; + + genvar i,j,k; + generate; + for (i=0; i= NumCoresPerTile) begin + ini_group = $bits(group_id_t)'(metadata_sel.ini_addr - NumCoresPerTile) ^ group_id; + ini_tile = metadata_sel.tile_id; + ini_core = metadata_sel.core_id; + end else begin + ini_group = group_id; + ini_tile = j; + ini_core = metadata_sel.ini_addr; + end + `ifdef XQUEUE_TCDM_ADAPTER + //Stall calculation for queue operations + if (`TCDM_ADAPTER(i,j,k).increment_head || `TCDM_ADAPTER(i,j,k).increment_tail) begin + stall_d <= 0; + end else begin + if (`TCDM_ADAPTER(i,j,k).queue_stalled_q) begin + stall_d <= stall_q + 1; + end + end + //Print the cycles of stalled queue operation when it is resolved + if (`TCDM_ADAPTER(i,j,k).queue_stalled_q && !(`TCDM_ADAPTER(i,j,k).queue_stalled_d)) begin + print_stall_d = 1'b1; + stall = stall_q; + end + `endif + //Print Non-Atomic Loads and Stores + if ((`TCDM_ADAPTER(i,j,k).in_amo_i == '0) && `TCDM_ADAPTER(i,j,k).in_valid_i && `TCDM_ADAPTER(i,j,k).in_ready_o) begin + in_addr_d = `TCDM_ADAPTER(i,j,k).in_address_i; + if (`TCDM_ADAPTER(i,j,k).in_write_i) begin + print_sw_d = 1'b1; + sw_d =
`TCDM_ADAPTER(i,j,k).in_wdata_i; + end else begin + print_lw_d = 1'b1; + end + end + end + + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + stall_q <= 0; + increment_head_q <= '0; + increment_tail_q <= '0; + vld_amo_op_q <= '0; + q_push_data_q <= '0; + print_stall_q <= '0; + print_lw_q <= '0; + print_sw_q <= '0; + in_addr_q <= '0; + sw_q <= '0; + end else begin + stall_q <= stall_d; + `ifdef XQUEUE_TCDM_ADAPTER + increment_head_q <= `TCDM_ADAPTER(i,j,k).increment_head; + increment_tail_q <= `TCDM_ADAPTER(i,j,k).increment_tail; + vld_amo_op_q <= `TCDM_ADAPTER(i,j,k).vld_amo_op && `TCDM_ADAPTER(i,j,k).req_accepted; + `else + increment_head_q <= '0; + increment_tail_q <= '0; + vld_amo_op_q <= '0; + `endif + q_push_data_q <= q_push_data_d; + print_stall_q <= print_stall_d; + print_lw_q <= print_lw_d; + print_sw_q <= print_sw_d; + in_addr_q <= in_addr_d; + sw_q <= sw_d; + //Print when a Bank Operation is retired + if (BankTrace && `TCDM_ADAPTER(i,j,k).in_valid_o)begin + `ifdef XQUEUE_TCDM_ADAPTER + //AMO excluding Qpush and Qpop + if(vld_amo_op_q)begin + trace_entry = $sformatf("%t: (%1d,%2d,%2d): %s, init=(%1d,%2d,%2d), address= 0x%h, data= %d\n",$time,i,j,k,`TCDM_ADAPTER(i,j,k).amo_op_q, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).addr_q,`TCDM_ADAPTER(i,j,k).amo_result); + $fwrite(f, trace_entry); + end + //Queue operations + if(increment_head_q || increment_tail_q) begin + if (increment_head_q) begin + trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpop ,",$time,i,j,k); + trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).in_rdata_o); + end else if (increment_tail_q)begin + trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpush,",$time,i,j,k); + trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, q_push_data_q); + end + if(print_stall_q) begin + trace_entry = $sformatf("%s: Qstall=%d\n", trace_entry, stall); + end else 
begin + trace_entry = $sformatf("%s\n",trace_entry); + end + $fwrite(f, trace_entry); + end + `endif + //Load + if (print_lw_q) begin + trace_entry = $sformatf("%t: (%1d,%2d,%2d): Load Word , init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, `TCDM_ADAPTER(i,j,k).in_rdata_o); + $fwrite(f, trace_entry); + end + //Store + if (print_sw_q) begin + trace_entry = $sformatf("%t: (%1d,%2d,%2d): Store Word, init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, sw_q); + $fwrite(f, trace_entry); + end + end + end + end + end + end + end + endgenerate + + final begin + $fclose(f); + end + `endif `endif