diff --git a/Bender.yml b/Bender.yml
index 08b62d28c..982aa0c6b 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -28,6 +28,7 @@ sources:
     - hardware/src/mempool_cc.sv
     - hardware/src/snitch_addr_demux.sv
     - hardware/src/tcdm_adapter.sv
+    - hardware/src/tcdm_adapter_xqueue.sv
     - hardware/src/tcdm_shim.sv
     - hardware/src/tcdm_wide_narrow_mux.sv
     - hardware/src/address_scrambler.sv
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 187934617..f2ef14457 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Add a DMA
+- Add support for hardware-accelerated queues for CGRA (RV32A extension)
+- Add systolic implementations of matmul and 2D convolution exploiting the hardware-accelerated queues
+- Add the ability to trace the operations retired by the TCDM adapters
 ### Fixed
 - Measure the `wfi` stalls and stalls caused by `opc` properly
@@ -34,7 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add the `terapool` configuration
 - Add read-only caches to the hierarchical AXI interconnect
 - Add a `memcpy` benchmark
-- Add a systolic configuration including runtime support and a matmul application
+- Add a systolic configuration for a software-emulated CGRA, including runtime support and a systolic matmul
 - Add `axpy` kernel
 - Add Spyglass linting scripts
 - Add an OpenMP runtime and example applications
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a64295735..c7145f41e 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -11,4 +11,5 @@ Thanks to all.
 * Marc Gantenbein
 * Marco Bertuletti
 * Sergio Mazzola
+* Vaibhav Krishna
 * Yichao Zhang
diff --git a/config/README.md b/config/README.md
index 1aa187773..60641c979 100644
--- a/config/README.md
+++ b/config/README.md
@@ -10,7 +10,8 @@ flavors of MemPool.
-We currently support three flavors:
+We currently support four flavors:
 - `terapool`: 1024 cores, organized into 128 tiles with eight cores each
 - `mempool`: 256 cores, organized into 64 tiles with four cores each (default)
 - `minpool`: 16 cores, organized into 4 tiles with four cores each
+- `systolic`: same as `mempool`, but the cores form a CGRA
 Use the `config` variable to define which configuration to take.
For example, to run a simulation with the `minpool` configuration, you would run
diff --git a/config/config.mk b/config/config.mk
index ea0ff5425..fb01a9006 100644
--- a/config/config.mk
+++ b/config/config.mk
@@ -56,6 +56,9 @@ dmas_per_group ?= 4
 ## Xqueues configuration ##
 #############################
+# Hardware queues for systolic (atomic ISA extension in the TCDM adapter)
+xqueue ?= 0
+
 # XQueue extension's queue size in each memory bank (in words)
 xqueue_size ?= 0
diff --git a/config/mempool.mk b/config/mempool.mk
index a3df45b35..ec2c34154 100644
--- a/config/mempool.mk
+++ b/config/mempool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 20
diff --git a/config/minpool.mk b/config/minpool.mk
index 455cd30e6..484bef548 100644
--- a/config/minpool.mk
+++ b/config/minpool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Number of DMA backends in each group
 dmas_per_group ?= 1
diff --git a/config/systolic.mk b/config/systolic.mk
index 5de36e4c5..9d22978d3 100644
--- a/config/systolic.mk
+++ b/config/systolic.mk
@@ -15,8 +15,11 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
-axi_hier_radix ?= 16
+axi_hier_radix ?= 20
 # Number of AXI masters per group
 axi_masters_per_group ?= 1
@@ -29,6 +32,10 @@ seq_mem_size ?= 2048
 ## Xqueues configuration ##
 #############################
-# Xqueue extension's queue size (in queue entries)
-# in each memory bank (assume banking factor of 4)
+# Hardware queues for systolic (atomic ISA extension in the TCDM adapter)
+xqueue ?= 1
+
+# Systolic queue sizes (assuming a banking factor of 4) for:
+# - software queue emulation (size measured in queue entries)
+# - the hardware Xqueue's queue in each memory bank (size measured in words)
 xqueue_size ?= 4
diff --git a/config/terapool.mk b/config/terapool.mk
index 5d3f90854..a9df13cba 100644
--- a/config/terapool.mk
+++ b/config/terapool.mk
@@ -17,6 +17,9 @@ num_groups ?= 8
 # Number of cores per Terapool tile
 num_cores_per_tile ?= 8
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 8
diff --git a/hardware/Makefile b/hardware/Makefile
index 7965053d4..5c1cd7ab9 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -47,6 +47,7 @@ verilator_top ?= mempool_tb_verilator
 python ?= python3
 # Enable tracing
 snitch_trace ?= 0
+bank_trace ?= 0
 # Check if the specified QuestaSim version exists
 ifeq (, $(shell which $(questa_cmd)))
@@ -87,15 +88,19 @@ endif
 vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233
 vlog_args += -work $(library)
 # Defines
-vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups)
+vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor)
 vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks)
 vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg)
-vlog_defs += -DSNITCH_TRACE=$(snitch_trace)
+vlog_defs += -DSNITCH_TRACE=$(snitch_trace) -DBANK_TRACE=$(bank_trace)
 vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width)
 vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width)
 vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group)
 vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group)
-vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size)
+vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size)
+
+ifeq ($(xqueue),1)
+  vlog_defs += -DXQUEUE_TCDM_ADAPTER
+endif
 # Traffic generation enabled
 ifdef tg
diff --git a/hardware/deps/snitch/src/riscv_instr.sv b/hardware/deps/snitch/src/riscv_instr.sv
index 23107aa70..afbd2cd7c 100644
--- a/hardware/deps/snitch/src/riscv_instr.sv
+++ b/hardware/deps/snitch/src/riscv_instr.sv
@@ -935,6 +935,10 @@ package riscv_instr;
   localparam logic [31:0] PV_PACK_H = 32'b1101001??????????000?????1010111;
   localparam logic [31:0] PV_PACKHI_B = 32'b1101100??????????001?????1010111;
   localparam logic [31:0] PV_PACKLO_B = 32'b1110000??????????001?????1010111;
+  // AMO-format encodings (opcode 0101111, funct3 010): rs1 holds the queue
+  // address, rs2 the data to push (hardwired to zero for Q_POP), rd the response
+  localparam logic [31:0] Q_PUSH = 32'b00111????????????010?????0101111;
+  localparam logic [31:0] Q_POP = 32'b00110??00000?????010?????0101111;
   /* CSR Addresses */
   localparam logic [11:0] CSR_FFLAGS = 12'h1;
   localparam logic [11:0] CSR_FRM = 12'h2;
diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv
index e4d48bb18..fd2927834 100644
--- a/hardware/deps/snitch/src/snitch.sv
+++ b/hardware/deps/snitch/src/snitch.sv
@@ -18,7 +18,8 @@ module snitch
   parameter logic [31:0] MTVEC = BootAddr, // Exception Base Address (see privileged spec 3.1.7)
   parameter bit RVE = 0, // Reduced-register Extension
   parameter bit RVM = 1, // Enable Integer Multiplication & Division Extension
-  parameter int RegNrWritePorts = 2 // Implement one or two write ports into the register file
+  parameter int RegNrWritePorts = 2, // Implement one or two write ports into the register file
+  parameter bit Xqueue = 0
 ) (
   input logic clk_i,
   input logic rst_i,
@@ -152,7 +153,10 @@ module snitch
     AMOMin = 4'h8,
     AMOMinu = 4'h9,
     AMOLR = 4'hA,
-    AMOSC = 4'hB
+    AMOSC = 4'hB,
+    // TODO(smazzola): parametrize
+    QPush = 4'hC, // Only used when Xqueue is enabled
+    QPop = 4'hD // Only used when Xqueue is enabled
   } ls_amo;
   logic [31:0] ld_result;
@@ -1324,6 +1328,41 @@ module snitch
       end
       /* end of Xpulpimg extension */
+      /* Xqueues extension */
+      // TODO(khovg): Add define to include instr
+      riscv_instr::Q_PUSH: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPush;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      // TODO(khovg): Two source registers are unnecessary
+      riscv_instr::Q_POP: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPop;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      /* end of Xqueues extension */
+
       // TODO(zarubaf): Illegal Instructions
       default: begin
         illegal_inst = 1'b1;
diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv
index 096156608..3c86b19d4 100644
--- a/hardware/src/mempool_cc.sv
+++ b/hardware/src/mempool_cc.sv
@@ -57,10 +57,11 @@ module mempool_cc
   // Snitch Integer Core
   snitch #(
-    .BootAddr ( BootAddr ),
-    .MTVEC ( MTVEC ),
-    .RVE ( RVE ),
-    .RVM ( RVM )
+    .BootAddr ( BootAddr ),
+    .MTVEC ( MTVEC ),
+    .RVE ( RVE ),
+    .RVM ( RVM ),
+    .Xqueue ( mempool_pkg::Xqueue )
   ) i_snitch (
     .clk_i ,
     .rst_i ,
diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv
index a11eeeff1..ce7915ee3 100644
--- a/hardware/src/mempool_pkg.sv
+++ b/hardware/src/mempool_pkg.sv
@@ -35,7 +35,7 @@ package mempool_pkg;
   localparam integer unsigned DataWidth = 32;
   localparam integer unsigned BeWidth = DataWidth / 8;
   localparam integer unsigned ByteOffset = $clog2(BeWidth);
-  localparam integer unsigned BankingFactor = 4;
+  localparam integer unsigned BankingFactor = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif;
   localparam bit LrScEnable = 1'b1;
   localparam integer unsigned TCDMSizePerBank = 1024; // [B]
   localparam integer unsigned NumBanks = NumCores * BankingFactor;
@@ -258,6 +258,9 @@ package mempool_pkg;
    * QUEUE PARAMETERS *
    **********************/
+  // Enable the Xqueue extension (hardware queues in the TCDM adapters)
+  localparam bit Xqueue = `ifdef XQUEUE `XQUEUE `else 1'b0 `endif;
+
   // Size of xqueues in words (must be a power of two)
   localparam int unsigned XQueueSize = `ifdef XQUEUE_SIZE `XQUEUE_SIZE `else 0 `endif;
diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv
index a3a6aa50b..19dacacae 100644
--- a/hardware/src/mempool_tile.sv
+++ b/hardware/src/mempool_tile.sv
@@ -381,34 +381,65 @@ module mempool_tile
     assign bank_resp_payload[b].rdata.amo = '0; // Don't care
     assign bank_resp_wide[b] = meta_out.wide;
-    tcdm_adapter #(
-      .AddrWidth (TCDMAddrMemWidth),
-      .DataWidth (DataWidth ),
-      .metadata_t (bank_metadata_t ),
-      .LrScEnable (LrScEnable ),
-      .RegisterAmo(1'b0 )
-    ) i_tcdm_adapter (
-      .clk_i (clk_i ),
-      .rst_ni (rst_ni ),
-      .in_valid_i (bank_req_valid[b] ),
-      .in_ready_o (bank_req_ready[b] ),
-      .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
-      .in_amo_i (bank_req_payload[b].wdata.amo ),
-      .in_write_i (bank_req_payload[b].wen ),
-      .in_wdata_i (bank_req_payload[b].wdata.data ),
-      .in_meta_i (meta_in ),
-      .in_be_i (bank_req_payload[b].be ),
-      .in_valid_o (bank_resp_valid[b] ),
-      .in_ready_i (bank_resp_ready[b] ),
-      .in_rdata_o (bank_resp_payload[b].rdata.data ),
-      .in_meta_o (meta_out ),
-      .out_req_o (req_valid ),
-      .out_add_o (req_addr ),
-      .out_write_o (req_write ),
-      .out_wdata_o (req_wdata ),
-      .out_be_o (req_be ),
-      .out_rdata_i (resp_rdata )
-    );
+    if (Xqueue) begin: gen_tcdm_adapter_xqueue
+      tcdm_adapter_xqueue #(
+        .AddrWidth (TCDMAddrMemWidth),
+        .DataWidth (DataWidth ),
+        .XQueueSize (XQueueSize ),
+        .metadata_t (bank_metadata_t ),
+        .RegisterAmo(1'b0 )
+      ) i_tcdm_adapter (
+        .clk_i (clk_i ),
+        .rst_ni (rst_ni ),
+        .in_valid_i (bank_req_valid[b] ),
+        .in_ready_o (bank_req_ready[b] ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i (bank_req_payload[b].wdata.amo ),
+        .in_write_i (bank_req_payload[b].wen ),
+        .in_wdata_i (bank_req_payload[b].wdata.data ),
+        .in_meta_i (meta_in ),
+        .in_be_i (bank_req_payload[b].be ),
+        .in_valid_o (bank_resp_valid[b] ),
+        .in_ready_i (bank_resp_ready[b] ),
+        .in_rdata_o (bank_resp_payload[b].rdata.data ),
+        .in_meta_o (meta_out ),
+        .out_req_o (req_valid ),
+        .out_add_o (req_addr ),
+        .out_write_o (req_write ),
+        .out_wdata_o (req_wdata ),
+        .out_be_o (req_be ),
+        .out_rdata_i (resp_rdata )
+      );
+    end else begin: gen_tcdm_adapter
+      tcdm_adapter #(
+        .AddrWidth (TCDMAddrMemWidth),
+        .DataWidth (DataWidth ),
+        .metadata_t (bank_metadata_t ),
+        .LrScEnable (LrScEnable ),
+        .RegisterAmo(1'b0 )
+      ) i_tcdm_adapter (
+        .clk_i (clk_i ),
+        .rst_ni (rst_ni ),
+        .in_valid_i (bank_req_valid[b] ),
+        .in_ready_o (bank_req_ready[b] ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i (bank_req_payload[b].wdata.amo ),
+        .in_write_i (bank_req_payload[b].wen ),
+        .in_wdata_i (bank_req_payload[b].wdata.data ),
+        .in_meta_i (meta_in ),
+        .in_be_i (bank_req_payload[b].be ),
+        .in_valid_o (bank_resp_valid[b] ),
+        .in_ready_i (bank_resp_ready[b] ),
+        .in_rdata_o (bank_resp_payload[b].rdata.data ),
+        .in_meta_o (meta_out ),
+        .out_req_o (req_valid ),
+        .out_add_o (req_addr ),
+        .out_write_o (req_write ),
+        .out_wdata_o (req_wdata ),
+        .out_be_o (req_be ),
+        .out_rdata_i (resp_rdata )
+      );
+    end

     // Bank
     tc_sram #(
diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
new file mode 100644
index 000000000..196ed2222
--- /dev/null
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -0,0 +1,508 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Description: Handles the protocol conversion from valid/ready to req/gnt and correctly returns
+// the metadata. Additionally, it handles atomics and the hardware queue (Xqueue) operations.
+// Hence, it needs to be instantiated in front of an SRAM over which it has exclusive access.
+//
+// Author: Samuel Riedel
+
+`include "common_cells/registers.svh"
+
+module tcdm_adapter_xqueue
+  import cf_math_pkg::idx_width;
+#(
+  parameter int unsigned AddrWidth = 32,
+  parameter int unsigned DataWidth = 32,
+  parameter int unsigned XQueueSize = 4,
+  parameter type metadata_t = logic,
+  parameter bit RegisterAmo = 1'b0, // Cut path between request and response at the cost of increased AMO latency
+  // Dependent parameters. DO NOT CHANGE.
+  localparam int unsigned BeWidth = DataWidth/8,
+  localparam int unsigned QCntWidth = idx_width(XQueueSize)
+) (
+  input logic clk_i,
+  input logic rst_ni,
+  // master side
+  input logic in_valid_i, // Bank request
+  output logic in_ready_o, // Bank grant
+  input logic [AddrWidth-1:0] in_address_i, // Address
+  input logic [3:0] in_amo_i, // Atomic Memory Operation
+  input logic in_write_i, // 1: Store, 0: Load
+  input logic [DataWidth-1:0] in_wdata_i, // Write data
+  input metadata_t in_meta_i, // Meta data
+  input logic [BeWidth-1:0] in_be_i, // Byte enable
+  output logic in_valid_o, // Response valid
+  input logic in_ready_i, // Response ready
+  output logic [DataWidth-1:0] in_rdata_o, // Read data
+  output metadata_t in_meta_o, // Meta data
+  // slave side
+  output logic out_req_o, // Bank request
+  output logic [AddrWidth-1:0] out_add_o, // Address
+  output logic out_write_o, // 1: Store, 0: Load
+  output logic [DataWidth-1:0] out_wdata_o, // Write data
+  output logic [BeWidth-1:0] out_be_o, // Byte enable
+  input logic [DataWidth-1:0] out_rdata_i // Read data
+);
+
+  typedef enum logic [3:0] {
+    AMONone = 4'h0,
+    AMOSwap = 4'h1,
+    AMOAdd = 4'h2,
+    AMOAnd = 4'h3,
+    AMOOr = 4'h4,
+    AMOXor = 4'h5,
+    AMOMax = 4'h6,
+    AMOMaxu = 4'h7,
+    AMOMin = 4'h8,
+    AMOMinu = 4'h9,
+    AMOLR = 4'hA,
+    AMOSC = 4'hB,
+    QPush = 4'hC,
+    QPop = 4'hD
+  } amo_op_t;
+
+  typedef enum logic [2:0] {
+    Idle, DoAMO, WriteBackAMO, ResolveQPushStall, ResolveQPopStall
+  } state_e;
+
+  // Stored data in spill registers and fall through register
+  metadata_t stored_meta_data;
+  metadata_t stored_smeta_data;
+  logic [DataWidth-1:0] resp_in_data;
+
+  // Handshake signals for spill registers and fall through register
+  logic meta_in_vld, meta_in_rdy, meta_out_vld, meta_out_rdy;
+  logic smeta_in_vld, smeta_in_rdy, smeta_out_vld, smeta_out_rdy;
+  logic rdata_in_vld_d, rdata_in_vld_q;
+  logic rdata_in_rdy, rdata_out_vld, rdata_out_rdy;
+
+  // Response meta data selection and valid signals
+  logic sresp_select_d, sresp_select_q;
+  logic resp_vld;
+  logic sresp_vld;
+
+  // Helper signals to determine response data acquisition
+  logic mem_req;
+  logic prevent_resp_acq;
+
+  // FSM related signals
+  state_e state_q, state_d;
+  logic vld_amo_op;
+  logic req_accepted, resp_accepted;
+  logic queue_stalled_d, queue_stalled_q;
+  logic amo_wb;
+
+  // Temporary storage for AMO operations
+  amo_op_t amo_op_d, amo_op_q;
+  logic [AddrWidth-1:0] addr_d, addr_q;
+
+  // AMO ALU signals
+  logic [31:0] amo_operand_a;
+  logic [31:0] amo_operand_b_d, amo_operand_b_q;
+  logic [31:0] amo_result, amo_result_q;
+
+  // Queue counters
+  logic unsigned [QCntWidth-1:0] curr_tail_d, curr_tail_q;
+  logic unsigned [QCntWidth-1:0] next_tail_d, next_tail_q;
+  logic unsigned [QCntWidth-1:0] curr_head_d, curr_head_q;
+
+  // Queue counter increment
+  logic unsigned [QCntWidth-1:0] increment_operand, increment_result;
+
+  // Queue management signals
+  logic queue_empty;
+  logic queue_full;
+  logic increment_tail, increment_head;
+  logic stalled_queue_op;
+
+  // Stores the metadata at handshake (except stalled queue operations)
+  spill_register #(
+    .T (metadata_t),
+    .Bypass(1'b0 )
+  ) i_meta_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .valid_i(meta_in_vld ),
+    .ready_o(meta_in_rdy ),
+    .data_i (in_meta_i ),
+    .valid_o(meta_out_vld ),
+    .ready_i(meta_out_rdy ),
+    .data_o (stored_meta_data)
+  );
+  assign meta_in_vld = req_accepted & !stalled_queue_op;
+  assign meta_out_rdy = sresp_select_q ? 1'b0 : resp_accepted;
+
+  // Stores the metadata at handshake of stalled queue operations
+  spill_register #(
+    .T (metadata_t),
+    .Bypass(1'b0 )
+  ) i_stallmeta_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .valid_i(smeta_in_vld ),
+    .ready_o(smeta_in_rdy ),
+    .data_i (in_meta_i ),
+    .valid_o(smeta_out_vld ),
+    .ready_i(smeta_out_rdy ),
+    .data_o (stored_smeta_data)
+  );
+  assign smeta_in_vld = req_accepted & stalled_queue_op;
+  assign smeta_out_rdy = sresp_select_q ? resp_accepted : 1'b0;
+
+  // Store response data if it's not accepted immediately
+  fall_through_register #(
+    .T(logic [DataWidth-1:0])
+  ) i_rdata_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .clr_i (1'b0 ),
+    .testmode_i(1'b0 ),
+    .data_i (resp_in_data ),
+    .valid_i (rdata_in_vld_q),
+    .ready_o (rdata_in_rdy ),
+    .data_o (in_rdata_o ),
+    .valid_o (rdata_out_vld ),
+    .ready_i (rdata_out_rdy )
+  );
+  assign resp_in_data = out_rdata_i;
+  assign rdata_out_rdy = resp_accepted;
+
+  // Set if memory read/write request occurs this cycle
+  assign mem_req = out_req_o && !amo_wb;
+  // Acquire response data a cycle after a memory read/write request (can be forced or prevented)
+  assign rdata_in_vld_d = mem_req & !prevent_resp_acq;
+
+  // Output response valid if both meta and read data are available (the read data will always be last)
+  assign resp_vld = meta_out_vld & rdata_out_vld;
+  assign sresp_vld = smeta_out_vld & rdata_out_vld;
+  // Select output valid depending on response selection
+  assign in_valid_o = sresp_select_q ? sresp_vld : resp_vld;
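+  // Illustrative walk-through (inferred from the FSM below, not additional
+  // functionality): a QPush to a full queue or a QPop from an empty queue is
+  // granted but cannot respond yet. Its metadata is parked in
+  // i_stallmeta_register and queue_stalled_q is set. Once the opposite
+  // operation frees or fills a slot, the FSM replays the parked operation
+  // (ResolveQPushStall or ResolveQPopStall) and sresp_select_q steers the
+  // parked metadata onto the response port instead of i_meta_register.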
+  // Select output meta data depending on response selection
+  assign in_meta_o = sresp_select_q ? stored_smeta_data : stored_meta_data;
+
+  // Queue operations are not treated as valid AMO operations
+  assign vld_amo_op = !(amo_op_t'(in_amo_i) inside {AMONone, QPush, QPop});
+  // Request is accepted on successful input handshake
+  assign req_accepted = in_valid_i & in_ready_o;
+  // Response is accepted on successful output handshake
+  assign resp_accepted = in_ready_i & in_valid_o;
+
+  always_comb begin
+    // Default
+    amo_op_d = AMONone;
+    addr_d = addr_q;
+    amo_operand_b_d = amo_operand_b_q;
+    amo_wb = 1'b0;
+    state_d = state_q;
+    sresp_select_d = sresp_select_q;
+    queue_stalled_d = queue_stalled_q;
+
+    // While a response is pending, no requests are accepted
+    in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1;
+
+    // Feed-through of request
+    out_req_o = req_accepted;
+    out_add_o = in_address_i;
+    out_write_o = in_write_i;
+    out_wdata_o = in_wdata_i;
+    out_be_o = in_be_i;
+
+    // Response data as feed-through of read data
+    // resp_in_data = out_rdata_i;
+
+    // Flag to prevent read/write response acquisition in case it does not actually happen
+    prevent_resp_acq = 1'b0;
+
+    // Flags to increment queue counters
+    increment_tail = 1'b0;
+    increment_head = 1'b0;
+
+    // FSM
+    unique case (state_q)
+      // Idle state handles normal loads/stores, non-stalled queue operations,
+      // and the initial read of AMO operations (single-cycle operations).
+      // In case of a pending queue stall or an AMO operation, transition away.
+      Idle: begin
+        // Prepare queue push
+        if (amo_op_t'(in_amo_i) == QPush) begin
+          // Write data at tail of queue
+          out_add_o = curr_tail_q;
+          out_write_o = 1'b1;
+        end
+
+        // Prepare queue pop
+        if (amo_op_t'(in_amo_i) == QPop) begin
+          // Read data at head of queue
+          out_add_o = curr_head_q;
+        end
+
+        // Request accepted (triggers memory access)
+        if (req_accepted) begin
+          // Reset meta data selection to default meta data
+          sresp_select_d = 1'b0;
+
+          // AMO operation
+          if (vld_amo_op) begin
+            amo_op_d = amo_op_t'(in_amo_i);
+            addr_d = in_address_i;
+            amo_operand_b_d = in_wdata_i;
+            state_d = DoAMO;
+          end
+
+          // Queue push
+          if (amo_op_t'(in_amo_i) == QPush) begin
+            if (queue_full) begin
+              // Note: Memory write is still executed but the tail is not incremented
+              // Set stalled flag
+              queue_stalled_d = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
+            end else begin
+              // Set increment flag
+              increment_tail = 1'b1;
+              // Previous queue pop failed due to empty queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d = ResolveQPopStall;
+              end
+            end
+          end
+
+          // Queue pop
+          if (amo_op_t'(in_amo_i) == QPop) begin
+            if (queue_empty) begin
+              // Set stalled flag
+              queue_stalled_d = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
+            end else begin
+              // Set increment flag
+              increment_head = 1'b1;
+              // Previous queue push failed due to full queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d = ResolveQPushStall;
+              end
+            end
+          end
+        end
+      end
+
+      // DoAMO & WriteBackAMO states claim the memory interface for the AMO write
+      DoAMO, WriteBackAMO: begin
+        in_ready_o = 1'b0;
+        // Return to Idle one cycle later if we cut the path
+        state_d = (RegisterAmo && state_q != WriteBackAMO) ? WriteBackAMO : Idle;
+        // Commit AMO
+        out_req_o = 1'b1;
+        out_write_o = 1'b1;
+        out_add_o = addr_q;
+        out_be_o = 4'b1111;
+        amo_wb = 1'b1;
+        // Serve from register if we cut the path
+        if (RegisterAmo) begin
+          out_wdata_o = amo_result_q;
+        end else begin
+          out_wdata_o = amo_result;
+        end
+      end
+
+      // ResolveQPushStall state blocks any requests until the queue pop response
+      // has been accepted and then prepares the queue push response
+      // (a queue push stores its data even into a full queue but does not update the tail)
+      ResolveQPushStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o = 1'b0;
+        // Retrieve queue push data as dummy response (read data at tail of queue)
+        out_add_o = curr_tail_q;
+        out_write_o = 1'b0;
+        out_be_o = 4'b1111;
+        // Wait until pop response accepted
+        if (resp_accepted) begin
+          // Set increment flag
+          increment_tail = 1'b1;
+          // Trigger memory access
+          out_req_o = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d = Idle;
+        end
+      end
+
+      // ResolveQPopStall state blocks any requests until the queue push response
+      // has been accepted and then executes the queue pop
+      ResolveQPopStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o = 1'b0;
+        // Prepare queue pop (read data at head of queue)
+        out_add_o = curr_head_q;
+        out_write_o = 1'b0;
+        out_be_o = 4'b1111;
+        // Wait until push response accepted
+        if (resp_accepted) begin
+          // Set increment flag
+          increment_head = 1'b1;
+          // Trigger memory access
+          out_req_o = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d = Idle;
+        end
+      end
+      default:;
+    endcase
+  end
+
+  // ----------------
+  // AMO ALU
+  // ----------------
+  logic [33:0] adder_sum;
+  logic [32:0] adder_operand_a, adder_operand_b;
+
+  assign amo_operand_a = out_rdata_i;
+  assign adder_sum = adder_operand_a + adder_operand_b;
+  /* verilator lint_off WIDTH */
+  always_comb begin : amo_alu
+
+    adder_operand_a = $signed(amo_operand_a);
+    adder_operand_b = $signed(amo_operand_b_q);
+
+    amo_result = amo_operand_b_q;
+
+    unique case (amo_op_q)
+      // The default is to output operand_b
+      AMOSwap:;
+      AMOAdd: amo_result = adder_sum[31:0];
+      AMOAnd: amo_result = amo_operand_a & amo_operand_b_q;
+      AMOOr: amo_result = amo_operand_a | amo_operand_b_q;
+      AMOXor: amo_result = amo_operand_a ^ amo_operand_b_q;
+      AMOMax: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMin: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      AMOMaxu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMinu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      default: amo_result = '0;
+    endcase
+  end
+
+  if (RegisterAmo) begin : gen_amo_slice
+    `FFLNR(amo_result_q, amo_result, (state_q == DoAMO), clk_i)
+  end else begin : gen_amo_slice
+    assign amo_result_q = '0;
+  end
+
+  // ----------------
+  // QUEUE MANAGEMENT
+  // ----------------
+  assign queue_empty = (curr_head_q == curr_tail_q);
+  assign queue_full = (curr_head_q == next_tail_q);
+
+  assign increment_result = increment_operand + 1;
+
+  always_comb begin : queue_management
+    // Default
+    curr_tail_d = curr_tail_q;
+    next_tail_d = next_tail_q;
+    curr_head_d = curr_head_q;
+
+    // Increment queue counters
+    increment_operand = curr_head_q;
+    if (increment_tail) begin
+      increment_operand = next_tail_q;
+      curr_tail_d = next_tail_q;
+      next_tail_d = increment_result;
+    end
+    if (increment_head) begin
+      increment_operand = curr_head_q;
+      curr_head_d = increment_result;
+    end
+
+    // Select spill register for meta data
+    unique case (amo_op_t'(in_amo_i))
+      QPush: stalled_queue_op = queue_full;
+      QPop: stalled_queue_op = queue_empty;
+      default: stalled_queue_op = 1'b0;
+    endcase
+  end
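+  // Capacity note (a worked example derived from the comparisons above):
+  // the head/tail counters implement a circular buffer that keeps one slot
+  // free to distinguish full from empty. curr_head == curr_tail means
+  // empty, while curr_head == next_tail (i.e. tail + 1) means full. With
+  // XQueueSize = 4, a bank queue therefore holds at most 3 words at a time.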
+  // ----------------
+  // SEQUENTIAL PROCESS
+  // ----------------
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      state_q <= Idle;
+      amo_op_q <= amo_op_t'('0);
+      addr_q <= '0;
+      amo_operand_b_q <= '0;
+      rdata_in_vld_q <= 1'b0;
+      sresp_select_q <= 1'b0;
+      curr_tail_q <= 0;
+      next_tail_q <= 1;
+      curr_head_q <= 0;
+      queue_stalled_q <= 1'b0;
+    end else begin
+      state_q <= state_d;
+      amo_op_q <= amo_op_d;
+      addr_q <= addr_d;
+      amo_operand_b_q <= amo_operand_b_d;
+      rdata_in_vld_q <= rdata_in_vld_d;
+      sresp_select_q <= sresp_select_d;
+      curr_tail_q <= curr_tail_d;
+      next_tail_q <= next_tail_d;
+      curr_head_q <= curr_head_d;
+      queue_stalled_q <= queue_stalled_d;
+    end
+  end
+
+  // ----------------
+  // ASSERTIONS
+  // ----------------
+  // pragma translate_off
+  // Check for unsupported parameters
+  if (DataWidth != 32) begin
+    $error($sformatf("Module currently only supports DataWidth = 32. DataWidth is currently set to: %0d", DataWidth));
+  end
+
+  `ifndef VERILATOR
+  meta_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (meta_in_vld |-> meta_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_meta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  smeta_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (smeta_in_vld |-> smeta_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_stallmeta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  rdata_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_rdata_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  stalled_queue : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (!(queue_stalled_q && smeta_in_vld)))
+    else $fatal (1, "Trying to stall a queue operation despite an already stalled queue.");
+  `endif
+  // pragma translate_on
+
+endmodule
diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv
index c8dd12e9b..fedf5c651 100644
--- a/hardware/tb/mempool_tb.sv
+++ b/hardware/tb/mempool_tb.sv
@@ -194,6 +194,183 @@ module mempool_tb;
       end: gen_wfi_tiles
     end: gen_wfi_groups
+`endif
+`endif
+
+  /************************
+   *  Mempool Bank Trace  *
+   ************************/
+  // Accessing signals hierarchically is not supported by Verilator
+`ifndef TARGET_SYNTHESIS
+`ifndef TARGET_VERILATOR
+  // Hierarchy path to the TCDM adapter
+  `ifdef XQUEUE_TCDM_ADAPTER
+    `define TCDM_ADAPTER(group,tile,bank) \
+      dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter_xqueue.i_tcdm_adapter
+  `else
+    `define TCDM_ADAPTER(group,tile,bank) \
+      dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter.i_tcdm_adapter
+  `endif
+  int f;
+
+  initial begin
+    f = $fopen("trace_bank.dasm", "w");
+  end
+
+  localparam int BankTrace = `ifdef BANK_TRACE `BANK_TRACE `else 0 `endif;
+
+  genvar i,j,k;
+  generate
+    for (i = 0; i < NumGroups; i++) begin
+      for (j = 0; j < NumTilesPerGroup; j++) begin
+        for (k = 0; k < NumBanksPerTile; k++) begin
+          // Per-bank trace bookkeeping
+          bank_metadata_t metadata_sel;
+          group_id_t group_id, ini_group;
+          int unsigned ini_tile, ini_core;
+          int unsigned stall, stall_d, stall_q;
+          logic increment_head_q, increment_tail_q, vld_amo_op_q;
+          logic [31:0] q_push_data_d, q_push_data_q;
+          logic print_stall_d, print_stall_q;
+          logic print_lw_d, print_lw_q;
+          logic print_sw_d, print_sw_q;
+          logic [31:0] in_addr_d, in_addr_q;
+          logic [31:0] sw_d, sw_q;
+          string trace_entry;
+
+          assign group_id = group_id_t'(i);
+          assign metadata_sel = bank_metadata_t'(`TCDM_ADAPTER(i,j,k).in_meta_o);
+          assign q_push_data_d = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+
+          always_comb begin
+            // Defaults
+            stall_d = stall_q;
+            stall = stall_q;
+            print_stall_d = 1'b0;
+            print_lw_d = 1'b0;
+            print_sw_d = 1'b0;
+            in_addr_d = in_addr_q;
+            sw_d = sw_q;
+            // Decode the initiator of the operation from the metadata
+            if (metadata_sel.ini_addr >= NumCoresPerTile) begin
+              ini_group = $bits(group_id_t)'(metadata_sel.ini_addr - NumCoresPerTile) ^ group_id;
+              ini_tile = metadata_sel.tile_id;
+              ini_core = metadata_sel.core_id;
+            end else begin
+              ini_group = group_id;
+              ini_tile = j;
+              ini_core = metadata_sel.ini_addr;
+            end
+            `ifdef XQUEUE_TCDM_ADAPTER
+            // Stall calculation for queue operations
+            if (`TCDM_ADAPTER(i,j,k).increment_head || `TCDM_ADAPTER(i,j,k).increment_tail) begin
+              stall_d = 0;
+            end else begin
+              if (`TCDM_ADAPTER(i,j,k).queue_stalled_q) begin
+                stall_d = stall_q + 1;
+              end
+            end
+            // Record the cycles of a stalled queue operation when it is resolved
+            if (`TCDM_ADAPTER(i,j,k).queue_stalled_q && !(`TCDM_ADAPTER(i,j,k).queue_stalled_d)) begin
+              print_stall_d = 1'b1;
+              stall = stall_q;
+            end
+            `endif
+            // Track non-atomic loads and stores
+            if ((`TCDM_ADAPTER(i,j,k).in_amo_i == '0) && `TCDM_ADAPTER(i,j,k).in_valid_i && `TCDM_ADAPTER(i,j,k).in_ready_o) begin
+              in_addr_d = `TCDM_ADAPTER(i,j,k).in_address_i;
+              if (`TCDM_ADAPTER(i,j,k).in_write_i) begin
+                print_sw_d = 1'b1;
+                sw_d = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+              end else begin
+                print_lw_d = 1'b1;
+              end
+            end
+          end
+
+          always_ff @(posedge clk or negedge rst_n) begin
+            if (!rst_n) begin
+              stall_q <= 0;
+              increment_head_q <= '0;
+              increment_tail_q <= '0;
+              vld_amo_op_q <= '0;
+              q_push_data_q <= '0;
+              print_stall_q <= '0;
+              print_lw_q <= '0;
+              print_sw_q <= '0;
+              in_addr_q <= '0;
+              sw_q <= '0;
+            end else begin
+              stall_q <= stall_d;
+              `ifdef XQUEUE_TCDM_ADAPTER
+              increment_head_q <= `TCDM_ADAPTER(i,j,k).increment_head;
+              increment_tail_q <= `TCDM_ADAPTER(i,j,k).increment_tail;
+              vld_amo_op_q <= `TCDM_ADAPTER(i,j,k).vld_amo_op && `TCDM_ADAPTER(i,j,k).req_accepted;
+              `else
+              increment_head_q <= '0;
+              increment_tail_q <= '0;
+              vld_amo_op_q <= '0;
+              `endif
+              q_push_data_q <= q_push_data_d;
+              print_stall_q <= print_stall_d;
+              print_lw_q <= print_lw_d;
+              print_sw_q <= print_sw_d;
+              in_addr_q <= in_addr_d;
+              sw_q <= sw_d;
+              // Print when a bank operation is retired
+              if (BankTrace && `TCDM_ADAPTER(i,j,k).in_valid_o) begin
+                `ifdef XQUEUE_TCDM_ADAPTER
+                // AMOs excluding Qpush and Qpop
+                if (vld_amo_op_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): %s, init=(%1d,%2d,%2d), address= 0x%h, data= %d\n",$time,i,j,k,`TCDM_ADAPTER(i,j,k).amo_op_q, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).addr_q,`TCDM_ADAPTER(i,j,k).amo_result);
+                  $fwrite(f, trace_entry);
+                end
+                // Queue operations
+                if (increment_head_q || increment_tail_q) begin
+                  if (increment_head_q) begin
+                    trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpop ,",$time,i,j,k);
+                    trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                  end else if (increment_tail_q) begin
+                    trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpush,",$time,i,j,k);
+                    trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, q_push_data_q);
+                  end
+                  if (print_stall_q) begin
+                    trace_entry = $sformatf("%s: Qstall=%d\n", trace_entry, stall);
+                  end else begin
+                    trace_entry = $sformatf("%s\n",trace_entry);
+                  end
+                  $fwrite(f, trace_entry);
+                end
+                `endif
+                // Load
+                if (print_lw_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): Load Word , init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                  $fwrite(f, trace_entry);
+                end
+                // Store
+                if (print_sw_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): Store Word, init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, sw_q);
+                  $fwrite(f, trace_entry);
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  endgenerate
+
+  final begin
+    $fclose(f);
+  end
+
+`endif
`endif
diff --git a/software/apps/memcpy/main.c b/software/apps/memcpy/main.c
index c92a688a1..f93d2e0d1 100644
--- a/software/apps/memcpy/main.c
+++ b/software/apps/memcpy/main.c
@@ -27,7 +27,7 @@
 #ifndef SIZE
 #define SIZE ((NUM_CORES) * (NUM_CORES)*2)
 #endif
-#define BANKING_FACTOR (4)
+// Assume a banking factor of 4
 uint32_t l2_data_a[SIZE] __attribute__((section(".l2")))
     __attribute__((aligned(NUM_CORES * 4 * 4)));
diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
new file mode 100644
index 000000000..f4c4339b8
--- /dev/null
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -0,0 +1,151 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "systolic/conv_xqueue.h"
+
+// Dimensions of matrix X
+#define DIM_X_M 258
+#define DIM_X_N 61
+
+// Dimensions of matrix Y
+#define DIM_Y_M (DIM_X_M - 2)
+#define DIM_Y_N (DIM_X_N - 2)
+
+uint32_t *tile_map;
+uint32_t *core_map;
+
+int32_t *matrix_X;
+int32_t *matrix_Y;
+
+int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+int32_t *matrix_W = (int32_t *)weights;
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      new_matrix[y * num_cols + x] = (int32_t)(y + x);
+    }
+  }
+  *matrix = new_matrix;
+}
+
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%8X\n", (uint32_t)matrix);
+  for (uint32_t i = 0; i < num_rows; ++i) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      printf("%5d ", matrix[i * num_columns + j]);
+    }
+    printf("\n");
+  }
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate tile and core maps
+  if (core_id == 0) {
+    tile_map = (uint32_t *)simple_malloc(num_cores * 4);
+    core_map = (uint32_t *)simple_malloc(num_cores * 4);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set tile and core maps
+  tile_map[core_id] = tile_id;
+  core_map[core_id] = core_id;
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out maps
+    // print_matrix((int32_t *)tile_map, 1, num_cores);
+    // print_matrix((int32_t *)core_map, 1, num_cores);
+
+    // Initialize systolic array
+    systolic_init(tile_map, core_map);
+
+    // Create and initialize matrices
+    generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N);
+    matrix_Y = (int32_t *)simple_malloc(DIM_Y_M * DIM_Y_N * 4);
+
+    // Print out matrix X
+    // printf("> Print Matrix X\n");
+    // print_matrix(matrix_X, DIM_X_M, DIM_X_N);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    // mempool_start_benchmark();
+  }
+
+  // Start benchmark for all cores
+  mempool_barrier(num_cores);
+  mempool_start_benchmark();
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  switch (core_id) {
+  case 0:
+    systolic_conv_front(DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  case (NUM_CORES - 1):
+    systolic_conv_end(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  default:
+    systolic_conv_mid(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Stop benchmark for all cores
+  mempool_stop_benchmark();
+  mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    // mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out matrix Y
+    // printf("> Print Matrix Y\n");
+    // print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
new file mode 100644
index 000000000..5c69fde7e
--- /dev/null
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -0,0 +1,224 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "systolic/matmul_xqueue.h"
+
+// Dimensions of matrices
+#define DIM_M 24
+#define DIM_N 24
+#define DIM_P 24
+
+uint32_t *tile_mapping;
+uint32_t *core_mapping;
+
+int32_t *matrix_A;
+int32_t *matrix_B;
+
+uint32_t rep_count;
+
+systolic_matrix_t *syst_matrix_A;
+systolic_matrix_t *syst_matrix_B;
+systolic_matrix_t *syst_matrix_C;
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      new_matrix[y * num_cols + x] = (int32_t)(y + x);
+    }
+  }
+  *matrix = new_matrix;
+}
+
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%8X\n", (uint32_t)matrix);
+  for (uint32_t i = 0; i < num_rows; ++i) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      printf("%5d ", matrix[i * num_columns + j]);
+    }
+    printf("\n");
+  }
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate systolic grid mapping
+  if (core_id == 0) {
+    tile_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+    core_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+  }
+
+#if NUM_CORES == 16
+  // ----------
+  // 16 CORES
+  // ----------
+
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 4;
+  // uint32_t row_idx = core_id / 4;
+
+  // Assign grid position (col wise)
+  uint32_t col_idx = core_id / 4;
+  uint32_t row_idx = core_id % 4;
+
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 2;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 2;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+#elif NUM_CORES == 256
+  // ----------
+  // 256 CORES
+  // ----------
+
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 16;
+  // uint32_t row_idx = core_id / 16;
+
+  // Assign grid position (col wise)
+  uint32_t col_idx = core_id / 16;
+  uint32_t row_idx = core_id % 16;
+
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 8;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 8;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+
+  // Assign grid position (square square wise)
+  // uint32_t group_id = tile_id / 16;
+  // uint32_t add_col = group_id % 2;
+  // uint32_t add_row = group_id / 2;
+  // uint32_t col_idx = tile_id % 4;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // col_idx += add_col * 8;
+  // uint32_t row_idx = (tile_id % 16) / 4;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+  // row_idx += add_row * 8;
+#else
+#error Unsupported NUM_CORES
+#endif
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set tile and core mapping
+  tile_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id;
+  core_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = core_id;
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out tile mapping
+    // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+
+    // Print out core mapping
+    // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+
+    // Initialize systolic array
+    systolic_init(tile_mapping, core_mapping);
+
+    // Create systolic matrices
+    generate_gradient_matrix(&matrix_A, DIM_M, DIM_N);
+    systolic_matrix_create(&syst_matrix_A, matrix_A, DIM_M, DIM_N);
+    simple_free(matrix_A);
+    generate_gradient_matrix(&matrix_B, DIM_N, DIM_P);
+    systolic_matrix_create(&syst_matrix_B, matrix_B, DIM_N, DIM_P);
+    simple_free(matrix_B);
+    systolic_matrix_allocate(&syst_matrix_C, DIM_M, DIM_P);
+
+    // Print out systolic matrices A & B
+    // printf("> Print Systolic Matrices A & B\n");
+    // systolic_matrix_print(syst_matrix_A);
+    // systolic_matrix_print(syst_matrix_B);
+
+    // Set repetition count per submatrix of C (A->num_cols == B->num_rows)
+    rep_count = syst_matrix_A->num_cols / 2;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    mempool_start_benchmark();
+  }
+
+  // Start benchmark for all cores
+  // mempool_barrier(num_cores);
+  // mempool_start_benchmark();
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if ((row_idx == 0) && (col_idx == 0)) {
+    systolic_rcp_pe(rep_count, syst_matrix_A, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx == 0) && (col_idx != 0)) {
+    systolic_cp_pe(col_idx, rep_count, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx == 0)) {
+    systolic_rp_pe(row_idx, rep_count, syst_matrix_A, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx != 0)) {
+    systolic_np_pe(row_idx, col_idx, rep_count, syst_matrix_C);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Stop benchmark for all cores
+  // mempool_stop_benchmark();
+  // mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out systolic matrix C
+    // printf("> Print Systolic Matrix C\n");
+    // systolic_matrix_print(syst_matrix_C);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c
new file mode 100644
index 000000000..ee4b7ee92
--- /dev/null
+++ b/software/apps/systolic/xqueue_test/main.c
@@ -0,0 +1,100 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+int32_t *queue = 0;
+
+int32_t producer_check, consumer_check, dummy_check;
+
+// Queue push: blocks in hardware while the queue is full
+static inline int32_t queue_push(void *const queue, int32_t data) {
+  int32_t ret;
+  asm volatile("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue));
+  return ret;
+}
+
+// Queue pop: blocks in hardware while the queue is empty
+static inline int32_t queue_pop(void *const queue) {
+  int32_t ret;
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue));
+  return ret;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  extern int32_t __seq_start;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("Initialize\n");
+    queue = &__seq_start;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Producer
+  if (core_id == 0) {
+    int32_t data[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    int32_t check = 0;
+    int32_t resp;
+    int32_t dummy = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+    }
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+      check += data[i];
+    }
+    producer_check = check;
+    dummy_check = dummy;
+  }
+
+  // Consumer
+  if (core_id == 1) {
+    int32_t read_data;
+    int32_t check = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      printf("Rx: %d\n", read_data);
+    }
+    printf("Burst Test\n");
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      check += read_data;
+    }
+    consumer_check = check;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Print both checks
+  if (core_id == 0) {
+    printf("Check: %d/%d/%d\n", producer_check, consumer_check, dummy_check);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/runtime/encoding.h b/software/runtime/encoding.h
index ce0ce72fa..ee518a9ea 100644
--- a/software/runtime/encoding.h
+++ b/software/runtime/encoding.h
@@ -2148,6 +2148,10 @@
 #define MASK_PV_PACKHI_B 0xfe00707f
 #define MATCH_PV_PACKLO_B 0xe0001057
 #define MASK_PV_PACKLO_B 0xfe00707f
+#define MATCH_Q_PUSH 0x3800202f
+#define MASK_Q_PUSH 0xf800707f
+#define MATCH_Q_POP 0x3000202f
+#define MASK_Q_POP 0xf9f0707f
 #define CSR_FFLAGS 0x1
 #define CSR_FRM 0x2
 #define CSR_FCSR 0x3
@@ -3379,6 +3383,8 @@ DECLARE_INSN(pv_pack, MATCH_PV_PACK, MASK_PV_PACK)
 DECLARE_INSN(pv_pack_h, MATCH_PV_PACK_H, MASK_PV_PACK_H)
 DECLARE_INSN(pv_packhi_b, MATCH_PV_PACKHI_B, MASK_PV_PACKHI_B)
 DECLARE_INSN(pv_packlo_b, MATCH_PV_PACKLO_B, MASK_PV_PACKLO_B)
+DECLARE_INSN(q_push, MATCH_Q_PUSH, MASK_Q_PUSH)
+DECLARE_INSN(q_pop, MATCH_Q_POP, MASK_Q_POP)
 #endif
 #ifdef DECLARE_CSR
 DECLARE_CSR(fflags, CSR_FFLAGS)
diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h
index 108e217a5..12376e52d 100644
--- a/software/runtime/runtime.h
+++ b/software/runtime/runtime.h
@@ -11,6 +11,9 @@
 #include <stddef.h>
 #include <stdint.h>
+// e.g. with the default `mempool` flavor: 4 cores/tile * banking factor 4 = 16 banks per tile
+#define NUM_BANKS_PER_TILE (NUM_CORES_PER_TILE * BANKING_FACTOR)
+
 extern char l1_alloc_base;
 extern uint32_t atomic_barrier;
 extern volatile uint32_t wake_up_reg;
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 717a432d9..ee4003663 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -64,6 +64,7 @@ DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DP
 DEFINES += -DNUM_CORES=$(num_cores)
 DEFINES += -DNUM_GROUPS=$(num_groups)
 DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile)
+DEFINES += -DBANKING_FACTOR=$(banking_factor)
 DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}')
 DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}')
 DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
new file mode 100644
index 000000000..8e6e251de
--- /dev/null
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -0,0 +1,1065 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* TODO: Add a description
+ * TODO: Limitation: NUM_COLS_Y >= 2 <=> NUM_COLS >= 4
+ * TODO: Completely fixed to a kernel size of 3
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Array of queue ptrs in row-major order (concatenated kernels)
+int32_t *queues_x_0[NUM_CORES];
+int32_t *queues_x_1[NUM_CORES];
+
+// Queue push: blocks in hardware while the queue is full
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)"
+               : "+r"(*ret)
+               : "r"(data), "r"(queue)
+               : "memory");
+}
+
+// Queue pop: blocks in hardware while the queue is empty
+static inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue) : "memory");
+}
+
+void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) {
+  // Create systolic array via queues
+  extern int32_t __seq_start;
+  uint32_t tile_id;
+  uint32_t core_id;
+  uint32_t tile_offset;
+  uint32_t core_offset;
+
+  for (uint32_t i = 0; i < NUM_CORES; ++i) {
+    tile_id = tile_map[i];
+    core_id = core_map[i];
+    tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
+    core_offset = (core_id % 4) * 4;
+    queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0;
+    queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1;
+  }
+
+  // Print out queue addresses
+  // printf("queues_x_0\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_0[i]);
+  // }
+  // printf("\n");
+  // printf("queues_x_1\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_1[i]);
+  // }
+  // printf("\n");
+}
+
+void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
+                         int32_t const *__restrict__ X,
+                         int32_t const *__restrict__ W,
+                         int32_t *__restrict__ Y) {
+  int32_t *queue_next_x_0;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  register int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
+
+  // Assign queues
+  queue_next_x_0 = queues_x_0[1];
+  queue_next_x_1 = queues_x_1[1];
+
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
+    }
+  }
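+  // Sketch of the accumulator rotation below (an illustration inferred from
+  // the loop bodies, not additional functionality): each output pixel needs
+  // a window of three input columns, so three partial sums rotate roles
+  // every column:
+  //   column c+0: acc_y[2] is completed and stored, acc_y[0]/acc_y[1] accumulate
+  //   column c+1: acc_y[0] is completed and stored, acc_y[1]/acc_y[2] accumulate
+  //   column c+2: acc_y[1] is completed and stored, acc_y[2]/acc_y[0] accumulate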
+  // Execute row-wise systolic 2d convolution
+  row = 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 1];
+    curr_x[2] = X[(row - 0) * num_cols + 1];
+    curr_x[0] = X[(row - 2) * num_cols + 1];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st row of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
+    col = 2;
+    while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 1];
+      curr_x[2] = X[(row - 0) * num_cols + col + 1];
+      curr_x[0] = X[(row - 2) * num_cols + col + 1];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 2];
+      curr_x[2] = X[(row - 0) * num_cols + col + 2];
+      curr_x[0] = X[(row - 2) * num_cols + col + 2];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    __asm__ __volatile__("" ::: "memory");
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
+    while (col < num_cols) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Increment column index
+      ++col;
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector (col was already incremented above)
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+    }
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
+    // -------------
+    // INCREMENT ROW
+    // -------------
+    row += NUM_CORES;
+  }
+
+  // Finish last row of systolic 2d convolution without pushing
+  if (row == num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += 
curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 1]; + curr_x[2] = X[(row - 0) * num_cols + 1]; + curr_x[0] = X[(row - 2) * num_cols + 1]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 2]; + curr_x[2] = X[(row - 0) * num_cols + col + 2]; + curr_x[0] = X[(row - 2) * num_cols + col + 2]; + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // 
---------------------
+    while (col < num_cols) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Increment column index
+      ++col;
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+      // Increment column index
+      ++col;
+    }
+  }
+}
+
+void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
+                       const uint32_t num_cols, int32_t const *__restrict__ X,
+                       int32_t const *__restrict__ W, int32_t *__restrict__ Y) {
+  int32_t *queue_prev_x_0;
+  int32_t *queue_next_x_0;
+  int32_t *queue_prev_x_1;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  register int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
+
+  // Assign queues
+  queue_prev_x_0 = queues_x_0[kernel_id];
+  queue_next_x_0 = queues_x_0[kernel_id + 1];
+  queue_prev_x_1 = queues_x_1[kernel_id];
+  queue_next_x_1 = queues_x_1[kernel_id + 1];
+
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
+    }
+  }
+
+  // Execute row-wise systolic 2d convolution
+  row = kernel_id + 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 0];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ----------
+    // POPULATE 1
+    //
---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + 
acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; + } + // ------------------ + // RESET ACCUMULATORS + // ------------------ + acc_y[0] = 0; + acc_y[1] = 0; + acc_y[2] = 0; + // ------------- + // INCREMENT ROW + // ------------- + row += NUM_CORES; + } + + // Finish last row of systolic 2d convolution without pushing + if (row == num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += 
curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * 
weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; + } + } +} + +void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, + const uint32_t num_cols, int32_t const *__restrict__ X, + int32_t const *__restrict__ W, int32_t *__restrict__ Y) { + int32_t *queue_prev_x_0; + int32_t *queue_prev_x_1; + int32_t weights[3][3]; + int32_t curr_x[3]; + register int32_t acc_y[3] = {0, 0, 0}; + uint32_t col; + uint32_t num_cols_y = num_cols - 2; + + // Assign queues + queue_prev_x_0 = queues_x_0[kernel_id]; + queue_prev_x_1 = queues_x_1[kernel_id]; + + // Load weights + for (uint32_t y = 0; y < 3; ++y) { + for (uint32_t x = 0; x < 3; ++x) { + weights[y][x] = W[y * 3 + x]; + } + } + + // Execute row-wise systolic 2d convolution + for (uint32_t row = kernel_id + 2; row < num_rows; row += NUM_CORES) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] 
* weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + 
acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+    }
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
+  }
+}
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
new file mode 100644
index 000000000..c1f8aac3b
--- /dev/null
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -0,0 +1,996 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
+ * C = AB
+ * (max dimension is 16-bit)
+ * Matrix is processed in 2x2 submatrices with the following indexing
+ *
+ *          B B       0 1
+ *          B B       2 3
+ *
+ *   A A    C C   =   0 2    0 1
+ *   A A    C C       1 3    2 3
+ *
+ * e.g. C0 = A2 * B2 + A0 * B0
+ *
+ * We use two interleaved queues per direction
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Dimensions of square systolic array
+#define SYSTOLIC_SIZE 16
+
+// Systolic matrix
+typedef struct {
+  int32_t *matrix;
+  uint32_t num_rows;
+  uint32_t num_cols;
+} systolic_matrix_t;
+
+// TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE
+
+// Array of queue ptrs in row-major order
+int32_t *queues_vert_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_vert_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+
+// queue push
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+}
+
+// queue pop
+inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
+}
+
+void systolic_init(uint32_t const *tile_mapping,
+                   uint32_t const *core_mapping) {
+  // Create systolic array via queues
+  extern int32_t __seq_start;
+  uint32_t grid_pos = 0;
+  uint32_t tile_id;
+  uint32_t core_id;
+  uint32_t tile_offset;
+  uint32_t core_offset;
+  for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+    for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+      tile_id = tile_mapping[grid_pos];
+      core_id = core_mapping[grid_pos];
+      tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
+      core_offset = core_id % 4 * 4;
+      queues_vert_0[y][x] = &__seq_start + tile_offset + core_offset + 0;
+      queues_vert_1[y][x] = &__seq_start + tile_offset + core_offset + 1;
+      queues_horz_0[y][x] = &__seq_start + tile_offset + core_offset + 2;
+      queues_horz_1[y][x] = &__seq_start + tile_offset + core_offset + 3;
+      ++grid_pos;
+    }
+  }
+
+  // Print out queue addresses
+  // printf("queues_vert_0\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_0[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_vert_1\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_1[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_horz_0\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //
printf("%5d ", queues_horz_0[y][x]); + // } + // printf("\n"); + // } + // printf("queues_horz_1\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_horz_1[y][x]); + // } + // printf("\n"); + // } +} + +void systolic_matrix_allocate(systolic_matrix_t **syst_matrix, + uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + + // Allocate matrix array + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix, + uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + + // Allocate matrix array + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); + + // Copy data into new matrix array + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + array[y * syst_num_cols + x] = matrix[y * num_cols + x]; + } + } + + // Zero padding of matrix array + if (syst_num_cols != num_cols) { + for (uint32_t y = 0; y < syst_num_rows; ++y) { + array[y * syst_num_cols + syst_num_cols - 1] = 0; + } + } + if (syst_num_rows != num_rows) { + for (uint32_t x = 0; x < syst_num_cols; ++x) { + array[(syst_num_rows - 1) * syst_num_cols + x] = 0; + } + } + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_print(systolic_matrix_t *syst_matrix) { + printf("Systolic matrix at 0x%08X\n", (uint32_t)syst_matrix); + uint32_t num_rows = syst_matrix->num_rows; + uint32_t num_cols = syst_matrix->num_cols; + int32_t *matrix = syst_matrix->matrix; + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + printf("%5d ", matrix[y * num_cols + x]); + } + printf("\n"); + } +} + +// row and column producing processing element +void systolic_rcp_pe(const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_A; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t 
anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_next_horz_0 = queues_horz_0[0][1]; + queue_next_horz_1 = queues_horz_1[0][1]; + queue_next_vert_0 = queues_vert_0[1][0]; + queue_next_vert_1 = queues_vert_1[1][0]; + + // Get matrix arrays + matrix_A = A->matrix; + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[y * num_cols_A + i]; + data_vert[0] = matrix_B[i * num_cols_B + x]; + data_horz[1] = matrix_A[(y + 1) * num_cols_A + i]; + data_vert[1] = matrix_B[i * num_cols_B + x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[y * num_cols_A + i + 1]; + data_vert[2] = matrix_B[(i + 1) * num_cols_B + x]; + data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } + } +} + +// column producing processing element +void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_prev_horz_0 = queues_horz_0[0][col_idx]; + queue_prev_horz_1 = 
queues_horz_1[0][col_idx]; + if (col_idx == SYSTOLIC_SIZE - 1) { + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; + } else { + queue_next_horz_0 = queues_horz_0[0][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[0][col_idx + 1]; + } + queue_next_vert_0 = queues_vert_0[1][col_idx]; + queue_next_vert_1 = queues_vert_1[1][col_idx]; + + // Get matrix arrays + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Check if PE is at the right boundary + if (queue_next_horz_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } else { + // Execute 
step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_vert_0, data_horz[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_horz[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_vert_0, data_horz[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_horz[3], &resp_vert_1); + } + } + } + } + } +} + +// row producing processing element +void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_A; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_y; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_next_horz_0 = queues_horz_0[row_idx][1]; + queue_next_horz_1 = queues_horz_1[row_idx][1]; + queue_prev_vert_0 = 
queues_vert_0[row_idx][0]; + queue_prev_vert_1 = queues_vert_1[row_idx][0]; + if (row_idx == SYSTOLIC_SIZE - 1) { + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; + } else { + queue_next_vert_0 = queues_vert_0[row_idx + 1][0]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][0]; + } + + // Get matrix arrays + matrix_A = A->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Check if PE is at the bottom boundary + if (queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } else { + // Execute step-wise matrix multiplication + for (uint32_t y 
= 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_vert[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[1], &resp_horz_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_vert[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[3], &resp_horz_1); + } + } + } + } + } +} + +// non-producing processing element +void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, + const uint32_t rep_count, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t data_dummy __attribute__((unused)) = 0; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_C; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + uint32_t shifted_y; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_prev_horz_0 = queues_horz_0[row_idx][col_idx]; + queue_prev_horz_1 = queues_horz_1[row_idx][col_idx]; + if (col_idx 
== SYSTOLIC_SIZE - 1) { + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; + } else { + queue_next_horz_0 = queues_horz_0[row_idx][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[row_idx][col_idx + 1]; + } + queue_prev_vert_0 = queues_vert_0[row_idx][col_idx]; + queue_prev_vert_1 = queues_vert_1[row_idx][col_idx]; + if (row_idx == SYSTOLIC_SIZE - 1) { + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; + } else { + queue_next_vert_0 = queues_vert_0[row_idx + 1][col_idx]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][col_idx]; + } + + // Get matrix arrays + matrix_C = C->matrix; + + // Get dimensions of matrices + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // PE is not at a boundary + if (queue_next_horz_0 && queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + 
queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } + + // PE is at the right boundary + if (!queue_next_horz_0 && queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + data_vert[0] += data_horz[0]; + data_vert[1] += data_horz[1]; + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + data_vert[2] += data_horz[2]; + data_vert[3] += data_horz[3]; + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } + + // PE is at the bottom boundary + if (queue_next_horz_0 && !queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + 
+ // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + data_horz[0] += data_vert[0]; + data_horz[1] += data_vert[1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + data_horz[2] += data_vert[2]; + data_horz[3] += data_vert[3]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + } + } + } + } + } + + // PE is at the bottom right corner + if (!queue_next_horz_0 && !queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += 
                data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            data_dummy += data_horz[0] * data_vert[0];
+            data_dummy += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            data_dummy += data_horz[2] * data_vert[2];
+            data_dummy += data_horz[3] * data_vert[3];
+            // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY
+            if (!data_dummy)
+              break;
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/toolchain/riscv-gnu-toolchain b/toolchain/riscv-gnu-toolchain
index 70acebe25..3b3b3dcbc 160000
--- a/toolchain/riscv-gnu-toolchain
+++ b/toolchain/riscv-gnu-toolchain
@@ -1 +1 @@
-Subproject commit 70acebe256fc49114b5f068fa79f03eb9affed09
+Subproject commit 3b3b3dcbc2c759924d25833374f4402d817b4b9c
diff --git a/toolchain/riscv-isa-sim/disasm/disasm.cc b/toolchain/riscv-isa-sim/disasm/disasm.cc
index fbb889775..d3d92c4ac 100644
--- a/toolchain/riscv-isa-sim/disasm/disasm.cc
+++ b/toolchain/riscv-isa-sim/disasm/disasm.cc
@@ -1515,6 +1515,10 @@ disassembler_t::disassembler_t(int xlen)
   DEFINE_RTYPE(pv_shuffle2_h);
   DEFINE_RTYPE(pv_shuffle2_b);
 
+  // Xqueues extension
+  DEFINE_XAMO(q_push)
+  DEFINE_XAMO_LR(q_pop)
+
   // provide a default disassembly for all instructions as a fallback
 #define DECLARE_INSN(code, match, mask) \
   add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {}));
diff --git a/toolchain/riscv-opcodes b/toolchain/riscv-opcodes
index 6bda68aa8..00b89eb39 160000
--- a/toolchain/riscv-opcodes
+++ b/toolchain/riscv-opcodes
@@ -1 +1 @@
-Subproject commit 6bda68aa82b78b47a61cbf0c08e39cf83a03f152
+Subproject commit 00b89eb39dbe8a980dd1485732b78231d01217c3
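
The only synchronization in the kernels above is the Xqueue instruction pair that this patch also teaches the disassembler: q.push.w (AMO format) and q.pop.w (LR format), wrapped by queue_push and queue_pop in matmul_xqueue.h. As a rough mental model, the plain-C sketch below mimics the blocking FIFO behavior the kernels depend on; sw_queue_t, QUEUE_SIZE, and the spin loops are illustrative assumptions, not the actual tcdm_adapter_xqueue.sv implementation, which resolves full and empty stalls inside the bank adapter rather than burning core cycles.

#include <stdint.h>

#define QUEUE_SIZE 4 // illustrative depth; the real depth is a build parameter

// Hypothetical software model of one bank queue (not the RTL behavior)
typedef struct {
  int32_t data[QUEUE_SIZE];
  volatile uint32_t head; // index of the oldest element
  volatile uint32_t tail; // index one past the newest element
} sw_queue_t;

// Blocks while the queue is full, as a q.push.w to a full queue would stall
static inline void sw_queue_push(sw_queue_t *q, int32_t value) {
  while (q->tail - q->head == QUEUE_SIZE)
    ; // spin; the hardware instead holds the request in the adapter
  q->data[q->tail % QUEUE_SIZE] = value;
  q->tail++;
}

// Blocks while the queue is empty, as a q.pop.w behaves like a stalled load
static inline int32_t sw_queue_pop(sw_queue_t *q) {
  while (q->head == q->tail)
    ; // spin until a producer pushes
  int32_t value = q->data[q->head % QUEUE_SIZE];
  q->head++;
  return value;
}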
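systolic_init carves the sequential memory region into four queue words per core (two vertical and two horizontal handles, matching the two interleaved queues per direction). The helper below restates that pointer arithmetic as a worked example, assuming SEQ_MEM_SIZE is the per-core sequential region in bytes and four cores per tile; queue_addr itself is not part of the patch.

#include <stdint.h>

#define SEQ_MEM_SIZE 2048 // assumed per-core sequential memory size in bytes

extern int32_t __seq_start; // start of the sequential memory region

// Mirrors the offset computation in systolic_init; offsets count 32-bit
// words because the queue handles are int32_t pointers.
static int32_t *queue_addr(uint32_t tile_id, uint32_t core_id, uint32_t idx) {
  uint32_t tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; // words per tile slot
  uint32_t core_offset = core_id % 4 * 4; // four queue words per core
  return &__seq_start + tile_offset + core_offset + idx; // idx = 0..3
}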
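The three convolution kernels chain into a core-to-core pipeline over image rows: each core loads the newest row of X itself, receives the two rows above it from its predecessor's queues, and forwards two rows to its successor, while the last core only consumes. A hypothetical top-level dispatch is sketched below; systolic_conv_mid and systolic_conv_end are the kernels from this patch, whereas the include path and the name and signature of the first kernel (here systolic_conv_front) are assumptions, since its definition begins before this excerpt.

#include "systolic/conv_xqueue.h" // include path assumed

void conv_dispatch(uint32_t kernel_id, uint32_t num_kernels, uint32_t num_rows,
                   uint32_t num_cols, int32_t const *X, int32_t const *W,
                   int32_t *Y) {
  if (kernel_id == 0)
    systolic_conv_front(num_rows, num_cols, X, W, Y); // loads all rows, pushes two
  else if (kernel_id == num_kernels - 1)
    systolic_conv_end(kernel_id, num_rows, num_cols, X, W, Y); // pops, never pushes
  else
    systolic_conv_mid(kernel_id, num_rows, num_cols, X, W, Y); // pops two, pushes two
}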
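For the matmul, each core's role follows from the queues the four PE kernels claim: systolic_rcp_pe connects to queues [0][1] and [1][0], so it sits at grid position (0,0); systolic_cp_pe fills the rest of the top row, systolic_rp_pe the rest of the left column, and systolic_np_pe the interior. The sketch below makes that mapping explicit; the dispatch function is not part of the patch, and rep_count presumably spans the inner matrix dimension in the 2-element steps each PE loop consumes.

#include "systolic/matmul_xqueue.h" // include path assumed

void matmul_dispatch(uint32_t row, uint32_t col, uint32_t rep_count,
                     systolic_matrix_t const *A, systolic_matrix_t const *B,
                     systolic_matrix_t const *C) {
  if (row == 0 && col == 0)
    systolic_rcp_pe(rep_count, A, B, C); // corner: feeds rows of A and columns of B
  else if (row == 0)
    systolic_cp_pe(col, rep_count, B, C); // top edge: feeds B columns downward
  else if (col == 0)
    systolic_rp_pe(row, rep_count, A, C); // left edge: feeds A rows rightward
  else
    systolic_np_pe(row, col, rep_count, C); // interior: forwards and computes
}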