From dfacf4d6997d98c838fb8b324f893d0d1d109773 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Wed, 14 Sep 2022 14:53:47 +0200
Subject: [PATCH 01/24] [config] Parametrize scratchpad banking factor

---
 config/mempool.mk           | 3 +++
 config/minpool.mk           | 3 +++
 config/systolic.mk          | 3 +++
 config/terapool.mk          | 3 +++
 hardware/Makefile           | 2 +-
 hardware/src/mempool_pkg.sv | 2 +-
 software/apps/memcpy/main.c | 2 +-
 software/runtime/runtime.h  | 2 ++
 software/runtime/runtime.mk | 1 +
 9 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/config/mempool.mk b/config/mempool.mk
index a3df45b35..ec2c34154 100644
--- a/config/mempool.mk
+++ b/config/mempool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
 
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 20
 
diff --git a/config/minpool.mk b/config/minpool.mk
index 455cd30e6..484bef548 100644
--- a/config/minpool.mk
+++ b/config/minpool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
 
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Number of DMA backends in each group
 dmas_per_group ?= 1
 
diff --git a/config/systolic.mk b/config/systolic.mk
index 5de36e4c5..d317e0dad 100644
--- a/config/systolic.mk
+++ b/config/systolic.mk
@@ -15,6 +15,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
 
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 16
 
diff --git a/config/terapool.mk b/config/terapool.mk
index 5d3f90854..a9df13cba 100644
--- a/config/terapool.mk
+++ b/config/terapool.mk
@@ -17,6 +17,9 @@ num_groups ?= 8
 # Number of cores per Terapool tile
 num_cores_per_tile ?= 8
 
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 8
 
diff --git a/hardware/Makefile b/hardware/Makefile
index 7965053d4..32454e193 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -87,7 +87,7 @@ endif
 vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233
 vlog_args += -work $(library)
 # Defines
-vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups)
+vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor)
 vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks)
 vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg)
 vlog_defs += -DSNITCH_TRACE=$(snitch_trace)
diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv
index a11eeeff1..5ba3234f4 100644
--- a/hardware/src/mempool_pkg.sv
+++ b/hardware/src/mempool_pkg.sv
@@ -35,7 +35,7 @@ package mempool_pkg;
   localparam integer unsigned DataWidth        = 32;
   localparam integer unsigned BeWidth          = DataWidth / 8;
   localparam integer unsigned ByteOffset       = $clog2(BeWidth);
-  localparam integer unsigned BankingFactor    = 4;
+  localparam integer unsigned BankingFactor    = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif;
   localparam bit              LrScEnable       = 1'b1;
   localparam integer unsigned TCDMSizePerBank  = 1024; // [B]
   localparam integer unsigned NumBanks         = NumCores * BankingFactor;
diff --git a/software/apps/memcpy/main.c b/software/apps/memcpy/main.c
index c92a688a1..f93d2e0d1 100644
--- a/software/apps/memcpy/main.c
+++ b/software/apps/memcpy/main.c
@@ -27,7 +27,7 @@
 #ifndef SIZE
 #define SIZE ((NUM_CORES) * (NUM_CORES)*2)
 #endif
-#define BANKING_FACTOR (4)
+// Assume banking factor of 4
 
 uint32_t l2_data_a[SIZE] __attribute__((section(".l2")))
 __attribute__((aligned(NUM_CORES * 4 * 4)));
diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h
index 108e217a5..12376e52d 100644
--- a/software/runtime/runtime.h
+++ b/software/runtime/runtime.h
@@ -11,6 +11,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#define NUM_BANKS_PER_TILE ((NUM_CORES_PER_TILE) * (BANKING_FACTOR))
+
 extern char l1_alloc_base;
 extern uint32_t atomic_barrier;
 extern volatile uint32_t wake_up_reg;
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 717a432d9..ee4003663 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -64,6 +64,7 @@ DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DP
 DEFINES += -DNUM_CORES=$(num_cores)
 DEFINES += -DNUM_GROUPS=$(num_groups)
 DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile)
+DEFINES += -DBANKING_FACTOR=$(banking_factor)
 DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}')
 DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}')
 DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')

From 738a8cfafae8383180cc63a35cb0d33eb69511df Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Tue, 18 May 2021 01:43:01 +0200
Subject: [PATCH 02/24] [hardware] Add support for atomic Xqueue operations to
 TCDM adapter

---
 Bender.yml                          |   1 +
 config/config.mk                    |   3 +
 config/systolic.mk                  |   8 +-
 hardware/Makefile                   |   2 +-
 hardware/src/mempool_pkg.sv         |   3 +
 hardware/src/mempool_tile.sv        |  87 +++--
 hardware/src/tcdm_adapter_xqueue.sv | 501 ++++++++++++++++++++++++++++
 7 files changed, 574 insertions(+), 31 deletions(-)
 create mode 100644 hardware/src/tcdm_adapter_xqueue.sv

diff --git a/Bender.yml b/Bender.yml
index 08b62d28c..982aa0c6b 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -28,6 +28,7 @@ sources:
   - hardware/src/mempool_cc.sv
   - hardware/src/snitch_addr_demux.sv
   - hardware/src/tcdm_adapter.sv
+  - hardware/src/tcdm_adapter_xqueue.sv
   - hardware/src/tcdm_shim.sv
   - hardware/src/tcdm_wide_narrow_mux.sv
   - hardware/src/address_scrambler.sv
diff --git a/config/config.mk b/config/config.mk
index ea0ff5425..fb01a9006 100644
--- a/config/config.mk
+++ b/config/config.mk
@@ -56,6 +56,9 @@ dmas_per_group ?= 4
 ##  Xqueues configuration  ##
 #############################
 
+# Hardware queues for systolic (atomic ISA extension in TCDM adapter)
+xqueue ?= 0
+
 # XQueue extension's queue size in each memory bank (in words)
 xqueue_size ?= 0
 
diff --git a/config/systolic.mk b/config/systolic.mk
index d317e0dad..e14ce5a99 100644
--- a/config/systolic.mk
+++ b/config/systolic.mk
@@ -32,6 +32,10 @@ seq_mem_size ?= 2048
 ##  Xqueues configuration  ##
 #############################
 
-# Xqueue extension's queue size (in queue entries)
-# in each memory bank (assume banking factor of 4)
+# Hardware queues for systolic (atomic ISA extension in TCDM adapter)
+xqueue ?= 1
+
+# Systolic queues size (assume banking factor of 4) for:
+# - software queues emulation (size measured in queue entries)
+# - hardware xqueue's queue in each memory bank (size measured in words)
 xqueue_size ?= 4
diff --git a/hardware/Makefile b/hardware/Makefile
index 32454e193..046d9ed04 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -95,7 +95,7 @@ vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width)
 vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width)
 vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group)
 vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group)
-vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size)
+vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size)
 
 # Traffic generation enabled
 ifdef tg
diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv
index 5ba3234f4..ce7915ee3 100644
--- a/hardware/src/mempool_pkg.sv
+++ b/hardware/src/mempool_pkg.sv
@@ -258,6 +258,9 @@ package mempool_pkg;
    *  QUEUE PARAMETERS  *
    **********************/
 
+  // Enable the hardware Xqueue extension in the TCDM adapter (1: enabled, 0: disabled)
+  localparam bit Xqueue = `ifdef XQUEUE `XQUEUE `else 1'b0 `endif;
+
   // Size of xqueues in words (must be a power of two)
   localparam int unsigned XQueueSize = `ifdef XQUEUE_SIZE `XQUEUE_SIZE `else 0 `endif;
 
diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv
index a3a6aa50b..19dacacae 100644
--- a/hardware/src/mempool_tile.sv
+++ b/hardware/src/mempool_tile.sv
@@ -381,34 +381,65 @@ module mempool_tile
     assign bank_resp_payload[b].rdata.amo     = '0; // Don't care
     assign bank_resp_wide[b]                  = meta_out.wide;
 
-    tcdm_adapter #(
-      .AddrWidth  (TCDMAddrMemWidth),
-      .DataWidth  (DataWidth       ),
-      .metadata_t (bank_metadata_t ),
-      .LrScEnable (LrScEnable      ),
-      .RegisterAmo(1'b0            )
-    ) i_tcdm_adapter (
-      .clk_i       (clk_i                                                                       ),
-      .rst_ni      (rst_ni                                                                      ),
-      .in_valid_i  (bank_req_valid[b]                                                           ),
-      .in_ready_o  (bank_req_ready[b]                                                           ),
-      .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
-      .in_amo_i    (bank_req_payload[b].wdata.amo                                               ),
-      .in_write_i  (bank_req_payload[b].wen                                                     ),
-      .in_wdata_i  (bank_req_payload[b].wdata.data                                              ),
-      .in_meta_i   (meta_in                                                                     ),
-      .in_be_i     (bank_req_payload[b].be                                                      ),
-      .in_valid_o  (bank_resp_valid[b]                                                          ),
-      .in_ready_i  (bank_resp_ready[b]                                                          ),
-      .in_rdata_o  (bank_resp_payload[b].rdata.data                                             ),
-      .in_meta_o   (meta_out                                                                    ),
-      .out_req_o   (req_valid                                                                   ),
-      .out_add_o   (req_addr                                                                    ),
-      .out_write_o (req_write                                                                   ),
-      .out_wdata_o (req_wdata                                                                   ),
-      .out_be_o    (req_be                                                                      ),
-      .out_rdata_i (resp_rdata                                                                  )
-    );
+    if (Xqueue) begin: gen_tcdm_adapter_xqueue
+      tcdm_adapter_xqueue #(
+        .AddrWidth  (TCDMAddrMemWidth),
+        .DataWidth  (DataWidth       ),
+        .XQueueSize (XQueueSize      ),
+        .metadata_t (bank_metadata_t ),
+        .RegisterAmo(1'b0            )
+      ) i_tcdm_adapter (
+        .clk_i       (clk_i                                                                       ),
+        .rst_ni      (rst_ni                                                                      ),
+        .in_valid_i  (bank_req_valid[b]                                                           ),
+        .in_ready_o  (bank_req_ready[b]                                                           ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i    (bank_req_payload[b].wdata.amo                                               ),
+        .in_write_i  (bank_req_payload[b].wen                                                     ),
+        .in_wdata_i  (bank_req_payload[b].wdata.data                                              ),
+        .in_meta_i   (meta_in                                                                     ),
+        .in_be_i     (bank_req_payload[b].be                                                      ),
+        .in_valid_o  (bank_resp_valid[b]                                                          ),
+        .in_ready_i  (bank_resp_ready[b]                                                          ),
+        .in_rdata_o  (bank_resp_payload[b].rdata.data                                             ),
+        .in_meta_o   (meta_out                                                                    ),
+        .out_req_o   (req_valid                                                                   ),
+        .out_add_o   (req_addr                                                                    ),
+        .out_write_o (req_write                                                                   ),
+        .out_wdata_o (req_wdata                                                                   ),
+        .out_be_o    (req_be                                                                      ),
+        .out_rdata_i (resp_rdata                                                                  )
+      );
+    end else begin: gen_tcdm_adapter
+      tcdm_adapter #(
+        .AddrWidth  (TCDMAddrMemWidth),
+        .DataWidth  (DataWidth       ),
+        .metadata_t (bank_metadata_t ),
+        .LrScEnable (LrScEnable      ),
+        .RegisterAmo(1'b0            )
+      ) i_tcdm_adapter (
+        .clk_i       (clk_i                                                                       ),
+        .rst_ni      (rst_ni                                                                      ),
+        .in_valid_i  (bank_req_valid[b]                                                           ),
+        .in_ready_o  (bank_req_ready[b]                                                           ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i    (bank_req_payload[b].wdata.amo                                               ),
+        .in_write_i  (bank_req_payload[b].wen                                                     ),
+        .in_wdata_i  (bank_req_payload[b].wdata.data                                              ),
+        .in_meta_i   (meta_in                                                                     ),
+        .in_be_i     (bank_req_payload[b].be                                                      ),
+        .in_valid_o  (bank_resp_valid[b]                                                          ),
+        .in_ready_i  (bank_resp_ready[b]                                                          ),
+        .in_rdata_o  (bank_resp_payload[b].rdata.data                                             ),
+        .in_meta_o   (meta_out                                                                    ),
+        .out_req_o   (req_valid                                                                   ),
+        .out_add_o   (req_addr                                                                    ),
+        .out_write_o (req_write                                                                   ),
+        .out_wdata_o (req_wdata                                                                   ),
+        .out_be_o    (req_be                                                                      ),
+        .out_rdata_i (resp_rdata                                                                  )
+      );
+    end
 
     // Bank
     tc_sram #(
diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
new file mode 100644
index 000000000..5f038c2e4
--- /dev/null
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -0,0 +1,501 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Description: Handles the protocol conversion from valid/ready to req/gnt and correctly returns
+// the metadata. Additionally, it handles atomics. Hence, it needs to be instantiated in front of
+// an SRAM over which it has exclusive access.
+//
+// Author: Samuel Riedel <sriedel@iis.ee.ethz.ch>
+
+`include "common_cells/registers.svh"
+
+module tcdm_adapter_xqueue #(
+  parameter int unsigned AddrWidth    = 32,
+  parameter int unsigned DataWidth    = 32,
+  parameter int unsigned XQueueSize   = 4,
+  parameter type         metadata_t   = logic,
+  parameter bit          RegisterAmo  = 1'b0, // Cut path between request and response at the cost of increased AMO latency
+  // Dependent parameters. DO NOT CHANGE.
+  localparam int unsigned BeWidth     = DataWidth/8,
+  localparam int unsigned QCntWidth   = $clog2(XQueueSize)
+) (
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+  // master side
+  input  logic                 in_valid_i,   // Bank request
+  output logic                 in_ready_o,   // Bank grant
+  input  logic [AddrWidth-1:0] in_address_i, // Address
+  input  logic [3:0]           in_amo_i,     // Atomic Memory Operation
+  input  logic                 in_write_i,   // 1: Store, 0: Load
+  input  logic [DataWidth-1:0] in_wdata_i,   // Write data
+  input  metadata_t            in_meta_i,    // Meta data
+  input  logic [BeWidth-1:0]   in_be_i,      // Byte enable
+  output logic                 in_valid_o,   // Response valid
+  input  logic                 in_ready_i,   // Response ready
+  output logic [DataWidth-1:0] in_rdata_o,   // Read data
+  output metadata_t            in_meta_o,    // Meta data
+  // slave side
+  output logic                 out_req_o,   // Bank request
+  output logic [AddrWidth-1:0] out_add_o,   // Address
+  output logic                 out_write_o, // 1: Store, 0: Load
+  output logic [DataWidth-1:0] out_wdata_o, // Write data
+  output logic [BeWidth-1:0]   out_be_o,    // Bit enable
+  input  logic [DataWidth-1:0] out_rdata_i  // Read data
+);
+
+  typedef enum logic [3:0] {
+      AMONone = 4'h0,
+      AMOSwap = 4'h1,
+      AMOAdd  = 4'h2,
+      AMOAnd  = 4'h3,
+      AMOOr   = 4'h4,
+      AMOXor  = 4'h5,
+      AMOMax  = 4'h6,
+      AMOMaxu = 4'h7,
+      AMOMin  = 4'h8,
+      AMOMinu = 4'h9,
+      AMOLR   = 4'hA,
+      AMOSC   = 4'hB,
+      QPush   = 4'hC,
+      QPop    = 4'hD
+  } amo_op_t;
+
+  typedef enum logic [2:0] {
+    Idle, DoAMO, WriteBackAMO, ResolveQPushStall, ResolveQPopStall
+  } state_e;
+
+  // Stored data in spill registers and fall through register
+  metadata_t           stored_meta_data;
+  metadata_t           stored_smeta_data;
+  logic[DataWidth-1:0] resp_in_data;
+
+  // Handshake signals for spill registers and fall through register
+  logic meta_in_vld, meta_in_rdy, meta_out_vld, meta_out_rdy;
+  logic smeta_in_vld, smeta_in_rdy, smeta_out_vld, smeta_out_rdy;
+  logic rdata_in_vld_d, rdata_in_vld_q;
+  logic rdata_in_rdy, rdata_out_vld, rdata_out_rdy;
+
+  // Response meta data selection and valid signals
+  logic sresp_select_d, sresp_select_q;
+  logic resp_vld;
+  logic sresp_vld;
+
+  // FSM related signals
+  state_e state_q, state_d;
+  logic   vld_amo_op;
+  logic   req_accepted, resp_accepted;
+  logic   queue_stalled_d, queue_stalled_q;
+
+  // Temporary storage for AMO operations
+  amo_op_t              amo_op_d, amo_op_q;
+  logic [AddrWidth-1:0] addr_d, addr_q;
+
+  // AMO ALU signals
+  logic [31:0] amo_operand_a;
+  logic [31:0] amo_operand_b_d, amo_operand_b_q;
+  logic [31:0] amo_result, amo_result_q;
+
+  // Queue counters
+  logic unsigned [QCntWidth-1:0] curr_tail_d, curr_tail_q;
+  logic unsigned [QCntWidth-1:0] next_tail_d, next_tail_q;
+  logic unsigned [QCntWidth-1:0] curr_head_d, curr_head_q;
+
+  // Queue counter increment
+  logic unsigned [QCntWidth-1:0] increment_operand, increment_result;
+
+  // Queue management signals
+  logic queue_empty;
+  logic queue_full;
+  logic increment_tail, increment_head;
+  logic stalled_queue_op;
+
+  // Temporary storage of write data for stalled queue push
+  logic[DataWidth-1:0] qpush_data_d, qpush_data_q;
+
+  // Stores the metadata at handshake (except stalled queue operations)
+  spill_register #(
+    .T     (metadata_t),
+    .Bypass(1'b0      )
+  ) i_meta_register (
+    .clk_i  (clk_i           ),
+    .rst_ni (rst_ni          ),
+    .valid_i(meta_in_vld     ),
+    .ready_o(meta_in_rdy     ),
+    .data_i (in_meta_i       ),
+    .valid_o(meta_out_vld    ),
+    .ready_i(meta_out_rdy    ),
+    .data_o (stored_meta_data)
+  );
+  assign meta_in_vld  = req_accepted & !in_write_i & !stalled_queue_op;
+  assign meta_out_rdy = sresp_select_q ? 1'b0 : resp_accepted;
+
+  // Stores the metadata at handshake of stalled queue operations
+  spill_register #(
+    .T     (metadata_t),
+    .Bypass(1'b0      )
+  ) i_stallmeta_register (
+    .clk_i  (clk_i            ),
+    .rst_ni (rst_ni           ),
+    .valid_i(smeta_in_vld     ),
+    .ready_o(smeta_in_rdy     ),
+    .data_i (in_meta_i        ),
+    .valid_o(smeta_out_vld    ),
+    .ready_i(smeta_out_rdy    ),
+    .data_o (stored_smeta_data)
+  );
+  assign smeta_in_vld  = req_accepted & stalled_queue_op;
+  assign smeta_out_rdy = sresp_select_q ? resp_accepted : 1'b0;
+
+  // Store response data if it's not accepted immediately
+  fall_through_register #(
+    .T(logic[DataWidth-1:0])
+  ) i_rdata_register (
+    .clk_i     (clk_i         ),
+    .rst_ni    (rst_ni        ),
+    .clr_i     (1'b0          ),
+    .testmode_i(1'b0          ),
+    .data_i    (resp_in_data  ),
+    .valid_i   (rdata_in_vld_q),
+    .ready_o   (rdata_in_rdy  ),
+    .data_o    (in_rdata_o    ),
+    .valid_o   (rdata_out_vld ),
+    .ready_i   (rdata_out_rdy )
+  );
+  assign resp_in_data  = out_rdata_i;
+  assign rdata_out_rdy = resp_accepted;
+
+  // Output response valid if both meta and read data are available (the read data will always be last)
+  assign resp_vld   = meta_out_vld  & rdata_out_vld;
+  assign sresp_vld  = smeta_out_vld & rdata_out_vld;
+  // Select output valid depending on response selection
+  assign in_valid_o = sresp_select_q ? sresp_vld         : resp_vld;
+  // Select output meta data depending on response selection
+  assign in_meta_o  = sresp_select_q ? stored_smeta_data : stored_meta_data;
+
+  // Exclude queue operations as valid amo operations (NOTE(review): AMOLR/AMOSC are not excluded and hit the AMO ALU default, writing back '0)
+  assign vld_amo_op    = !(amo_op_t'(in_amo_i) inside {AMONone, QPush, QPop});
+  // Request is accepted on successful input handshake
+  assign req_accepted  = in_valid_i & in_ready_o;
+  // Response is accepted on successful output handshake
+  assign resp_accepted = in_ready_i & in_valid_o;
+
+  always_comb begin
+    // Default
+    amo_op_d        = AMONone;
+    addr_d          = addr_q;
+    amo_operand_b_d = amo_operand_b_q;
+    state_d         = state_q;
+    sresp_select_d  = sresp_select_q;
+    queue_stalled_d = queue_stalled_q;
+    qpush_data_d    = qpush_data_q;
+
+    // While response is pending no requests are accepted
+    in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1;
+
+    // Feed-through of request
+    out_req_o   = req_accepted;
+    out_add_o   = in_address_i;
+    out_write_o = in_write_i;
+    out_wdata_o = in_wdata_i;
+    out_be_o    = in_be_i;
+
+    // Response data as feed-through of read data
+    // resp_in_data   = out_rdata_i;
+
+    // Response is acquired a cycle after a read access
+    rdata_in_vld_d = out_req_o & !out_write_o;
+
+    // Flags to increment queue counters
+    increment_tail = 1'b0;
+    increment_head = 1'b0;
+
+    // FSM
+    unique case (state_q)
+      // Idle State handles normal load/stores, non-stalled queue operations
+      // and the initial read of AMO operations (single cycle operations)
+      // In case of pending queue stall or AMO operations transition away
+      Idle: begin
+        // Prepare queue push
+        if (amo_op_t'(in_amo_i) == QPush) begin
+          // Write data at tail of queue
+          out_add_o   = curr_tail_q;
+          out_write_o = 1'b1;
+        end
+
+        // Prepare queue pop
+        if (amo_op_t'(in_amo_i) == QPop) begin
+          // Read data at head of queue
+          out_add_o = curr_head_q;
+        end
+
+        // Request accepted (triggers memory access)
+        if (req_accepted) begin
+          // Reset meta data selection to default meta data
+          sresp_select_d = 1'b0;
+
+          // AMO operation
+          if (vld_amo_op) begin
+            amo_op_d        = amo_op_t'(in_amo_i);
+            addr_d          = in_address_i;
+            amo_operand_b_d = in_wdata_i;
+            state_d         = DoAMO;
+          end
+
+          // Queue push
+          if (amo_op_t'(in_amo_i) == QPush) begin
+            if (queue_full) begin
+              // Set flag and store queue push data for later
+              queue_stalled_d = 1'b1;
+              qpush_data_d    = in_wdata_i; // TODO: MIGHT NOT BE NEEDED
+              // Prevent acquisition of response data (TODO: might not be needed)
+              rdata_in_vld_d  = 1'b0;
+            end else begin
+              // Set increment flag
+              increment_tail = 1'b1;
+              // Force acquisition of response data despite a write access
+              // Response data will match the write data of the write access
+              rdata_in_vld_d = 1'b1;
+              // Previous queue pop failed due to empty queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d         = ResolveQPopStall;
+              end
+            end
+          end
+
+          // Queue pop
+          if (amo_op_t'(in_amo_i) == QPop) begin
+            if (queue_empty) begin
+              // Set flag
+              queue_stalled_d = 1'b1;
+              // Prevent acquisition of response data despite read access
+              rdata_in_vld_d  = 1'b0;
+            end else begin
+              // Set increment flag
+              increment_head = 1'b1;
+              // Previous queue push failed due to full queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d         = ResolveQPushStall;
+              end
+            end
+          end
+        end
+      end
+
+      // DoAMO & WriteBackAMO State claims the memory interface for AMO write
+      DoAMO, WriteBackAMO: begin
+        in_ready_o  = 1'b0;
+        // Return to Idle one cycle later if we cut the path
+        state_d     = (RegisterAmo && state_q != WriteBackAMO) ?  WriteBackAMO : Idle;
+        // Commit AMO
+        out_req_o   = 1'b1;
+        out_write_o = 1'b1;
+        out_add_o   = addr_q;
+        out_be_o    = 4'b1111;
+        // serve from register if we cut the path
+        if (RegisterAmo) begin
+          out_wdata_o = amo_result_q;
+        end else begin
+          out_wdata_o = amo_result;
+        end
+      end
+
+      // ResolveQPushStall State blocks any requests until queue pop response
+      // has been accepted and then executes the queue push
+      ResolveQPushStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o  = 1'b0;
+        // Prepare queue push (write data at tail of queue)
+        // TODO: INSTEAD READ STORED DATA FOR PUSH RESPONSE
+        out_add_o   = curr_tail_q;
+        out_write_o = 1'b1;
+        out_wdata_o = qpush_data_q;
+        out_be_o    = 4'b1111;
+        // Wait until pop response accepted
+        if (resp_accepted) begin
+          // Set success flag
+          increment_tail = 1'b1;
+          // Trigger memory access
+          out_req_o      = 1'b1;
+          // Force acquisition of response data despite a write access
+          // Response data will match the write data of the write access
+          rdata_in_vld_d = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d        = Idle;
+        end
+      end
+
+      // ResolveQPopStall State blocks any requests until queue push response
+      // has been accepted and then executes the queue pop
+      ResolveQPopStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o  = 1'b0;
+        // Prepare queue pop (read data at head of queue)
+        out_add_o   = curr_head_q;
+        out_write_o = 1'b0;
+        out_be_o    = 4'b1111;
+        // Wait until push response accepted
+        if (resp_accepted) begin
+          // Set success flag
+          increment_head = 1'b1;
+          // Trigger memory access
+          out_req_o      = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d        = Idle;
+        end
+      end
+      default:;
+    endcase
+  end
+
+  // ----------------
+  // AMO ALU
+  // ----------------
+  logic [33:0] adder_sum;
+  logic [32:0] adder_operand_a, adder_operand_b;
+
+  assign amo_operand_a = out_rdata_i;
+  assign adder_sum     = adder_operand_a + adder_operand_b;
+  /* verilator lint_off WIDTH */
+  always_comb begin : amo_alu
+
+    adder_operand_a = $signed(amo_operand_a);
+    adder_operand_b = $signed(amo_operand_b_q);
+
+    amo_result = amo_operand_b_q;
+
+    unique case (amo_op_q)
+      // the default is to output operand_b
+      AMOSwap:;
+      AMOAdd: amo_result = adder_sum[31:0];
+      AMOAnd: amo_result = amo_operand_a & amo_operand_b_q;
+      AMOOr:  amo_result = amo_operand_a | amo_operand_b_q;
+      AMOXor: amo_result = amo_operand_a ^ amo_operand_b_q;
+      AMOMax: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMin: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      AMOMaxu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMinu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      default: amo_result = '0;
+    endcase
+  end
+
+  if (RegisterAmo) begin : gen_amo_slice
+    `FFLNR(amo_result_q, amo_result, (state_q == DoAMO), clk_i)
+  end else begin : gen_amo_slice
+    assign amo_result_q = '0;
+  end
+
+  // ----------------
+  // QUEUE MANAGEMENT
+  // ----------------
+  assign queue_empty = (curr_head_q == curr_tail_q);
+  assign queue_full  = (curr_head_q == next_tail_q);
+
+  assign increment_result = increment_operand + 1;
+
+  always_comb begin : queue_management
+    // Default
+    curr_tail_d = curr_tail_q;
+    next_tail_d = next_tail_q;
+    curr_head_d = curr_head_q;
+
+    // Increment queue counters
+    increment_operand = curr_head_q;
+    if (increment_tail) begin
+      increment_operand = next_tail_q;
+      curr_tail_d       = next_tail_q;
+      next_tail_d       = increment_result;
+    end
+    if (increment_head) begin
+      increment_operand = curr_head_q;
+      curr_head_d       = increment_result;
+    end
+
+    // Select spill register for meta data
+    unique case (amo_op_t'(in_amo_i))
+      QPush:   stalled_queue_op = queue_full;
+      QPop:    stalled_queue_op = queue_empty;
+      default: stalled_queue_op = 1'b0;
+    endcase
+  end
+
+  // ----------------
+  // SEQUENTIAL PROCESS
+  // ----------------
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      state_q         <= Idle;
+      amo_op_q        <= amo_op_t'('0);
+      addr_q          <= '0;
+      amo_operand_b_q <= '0;
+      rdata_in_vld_q  <= 1'b0;
+      sresp_select_q  <= 1'b0;
+      curr_tail_q     <= 0;
+      next_tail_q     <= 1;
+      curr_head_q     <= 0;
+      queue_stalled_q <= 1'b0;
+      qpush_data_q    <= '0;
+    end else begin
+      state_q         <= state_d;
+      amo_op_q        <= amo_op_d;
+      addr_q          <= addr_d;
+      amo_operand_b_q <= amo_operand_b_d;
+      rdata_in_vld_q  <= rdata_in_vld_d;
+      sresp_select_q  <= sresp_select_d;
+      curr_tail_q     <= curr_tail_d;
+      next_tail_q     <= next_tail_d;
+      curr_head_q     <= curr_head_d;
+      queue_stalled_q <= queue_stalled_d;
+      qpush_data_q    <= qpush_data_d;
+    end
+  end
+
+  // ----------------
+  // ASSERTIONS
+  // ----------------
+  // pragma translate_off
+  // Check for unsupported parameters
+  if (DataWidth != 32) begin
+    $error($sformatf("Module currently only supports DataWidth = 32. DataWidth is currently set to: %0d", DataWidth));
+  end
+
+  `ifndef VERILATOR
+    meta_full : assert property(
+      @(posedge clk_i) disable iff (~rst_ni) (meta_in_vld |-> meta_in_rdy))
+      else $fatal (1, "Trying to push new data although the i_meta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+    smeta_full : assert property(
+      @(posedge clk_i) disable iff (~rst_ni) (smeta_in_vld |-> smeta_in_rdy))
+      else $fatal (1, "Trying to push new data although the i_stallmeta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+    rdata_full : assert property(
+      @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy))
+      else $fatal (1, "Trying to push new data although the i_rdata_register is not ready.");
+  `endif
+  // pragma translate_on
+
+endmodule

From 49bc8a5c691f6bb3c224c1a352670302a59a0dcf Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Mon, 22 Aug 2022 14:16:05 +0200
Subject: [PATCH 03/24] [toolchain] Add toolchain support to xqueues extension
 (in standard atomic extension, illegal)

---
 hardware/deps/snitch/src/riscv_instr.sv  | 2 ++
 hardware/src/tcdm_adapter_xqueue.sv      | 4 +++-
 software/runtime/encoding.h              | 6 ++++++
 toolchain/riscv-gnu-toolchain            | 2 +-
 toolchain/riscv-isa-sim/disasm/disasm.cc | 4 ++++
 toolchain/riscv-opcodes                  | 2 +-
 6 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/hardware/deps/snitch/src/riscv_instr.sv b/hardware/deps/snitch/src/riscv_instr.sv
index 23107aa70..afbd2cd7c 100644
--- a/hardware/deps/snitch/src/riscv_instr.sv
+++ b/hardware/deps/snitch/src/riscv_instr.sv
@@ -935,6 +935,8 @@ package riscv_instr;
   localparam logic [31:0] PV_PACK_H          = 32'b1101001??????????000?????1010111;
   localparam logic [31:0] PV_PACKHI_B        = 32'b1101100??????????001?????1010111;
   localparam logic [31:0] PV_PACKLO_B        = 32'b1110000??????????001?????1010111;
+  localparam logic [31:0] Q_PUSH             = 32'b00111????????????010?????0101111;
+  localparam logic [31:0] Q_POP              = 32'b00110??00000?????010?????0101111;
   /* CSR Addresses */
   localparam logic [11:0] CSR_FFLAGS = 12'h1;
   localparam logic [11:0] CSR_FRM = 12'h2;
diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index 5f038c2e4..07426d57f 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -10,6 +10,8 @@
 
 `include "common_cells/registers.svh"
 
+import cf_math_pkg::idx_width;
+
 module tcdm_adapter_xqueue #(
   parameter int unsigned AddrWidth    = 32,
   parameter int unsigned DataWidth    = 32,
@@ -18,7 +20,7 @@ module tcdm_adapter_xqueue #(
   parameter bit          RegisterAmo  = 1'b0, // Cut path between request and response at the cost of increased AMO latency
   // Dependent parameters. DO NOT CHANGE.
   localparam int unsigned BeWidth     = DataWidth/8,
-  localparam int unsigned QCntWidth   = $clog2(XQueueSize)
+  localparam int unsigned QCntWidth   = idx_width(XQueueSize)
 ) (
   input  logic                 clk_i,
   input  logic                 rst_ni,
diff --git a/software/runtime/encoding.h b/software/runtime/encoding.h
index ce0ce72fa..ee518a9ea 100644
--- a/software/runtime/encoding.h
+++ b/software/runtime/encoding.h
@@ -2148,6 +2148,10 @@
 #define MASK_PV_PACKHI_B  0xfe00707f
 #define MATCH_PV_PACKLO_B 0xe0001057
 #define MASK_PV_PACKLO_B  0xfe00707f
+#define MATCH_Q_PUSH 0x3800202f
+#define MASK_Q_PUSH  0xf800707f
+#define MATCH_Q_POP 0x3000202f
+#define MASK_Q_POP  0xf9f0707f
 #define CSR_FFLAGS 0x1
 #define CSR_FRM 0x2
 #define CSR_FCSR 0x3
@@ -3379,6 +3383,8 @@ DECLARE_INSN(pv_pack, MATCH_PV_PACK, MASK_PV_PACK)
 DECLARE_INSN(pv_pack_h, MATCH_PV_PACK_H, MASK_PV_PACK_H)
 DECLARE_INSN(pv_packhi_b, MATCH_PV_PACKHI_B, MASK_PV_PACKHI_B)
 DECLARE_INSN(pv_packlo_b, MATCH_PV_PACKLO_B, MASK_PV_PACKLO_B)
+DECLARE_INSN(q_push, MATCH_Q_PUSH, MASK_Q_PUSH)
+DECLARE_INSN(q_pop, MATCH_Q_POP, MASK_Q_POP)
 #endif
 #ifdef DECLARE_CSR
 DECLARE_CSR(fflags, CSR_FFLAGS)
diff --git a/toolchain/riscv-gnu-toolchain b/toolchain/riscv-gnu-toolchain
index 70acebe25..3b3b3dcbc 160000
--- a/toolchain/riscv-gnu-toolchain
+++ b/toolchain/riscv-gnu-toolchain
@@ -1 +1 @@
-Subproject commit 70acebe256fc49114b5f068fa79f03eb9affed09
+Subproject commit 3b3b3dcbc2c759924d25833374f4402d817b4b9c
diff --git a/toolchain/riscv-isa-sim/disasm/disasm.cc b/toolchain/riscv-isa-sim/disasm/disasm.cc
index fbb889775..d3d92c4ac 100644
--- a/toolchain/riscv-isa-sim/disasm/disasm.cc
+++ b/toolchain/riscv-isa-sim/disasm/disasm.cc
@@ -1515,6 +1515,10 @@ disassembler_t::disassembler_t(int xlen)
   DEFINE_RTYPE(pv_shuffle2_h);
   DEFINE_RTYPE(pv_shuffle2_b);
 
+  // Xqueues extension
+  DEFINE_XAMO(q_push)
+  DEFINE_XAMO_LR(q_pop)
+
   // provide a default disassembly for all instructions as a fallback
   #define DECLARE_INSN(code, match, mask) \
    add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {}));
diff --git a/toolchain/riscv-opcodes b/toolchain/riscv-opcodes
index 6bda68aa8..00b89eb39 160000
--- a/toolchain/riscv-opcodes
+++ b/toolchain/riscv-opcodes
@@ -1 +1 @@
-Subproject commit 6bda68aa82b78b47a61cbf0c08e39cf83a03f152
+Subproject commit 00b89eb39dbe8a980dd1485732b78231d01217c3

From d7bad0a60bd0b1906cecd19c9937611b515fab77 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Tue, 18 May 2021 19:39:41 +0200
Subject: [PATCH 04/24] [snitch] Add xqueues extension to instruction decoder

---
 hardware/deps/snitch/src/snitch.sv | 43 ++++++++++++++++++++++++++++--
 hardware/src/mempool_cc.sv         |  9 ++++---
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv
index e4d48bb18..fd2927834 100644
--- a/hardware/deps/snitch/src/snitch.sv
+++ b/hardware/deps/snitch/src/snitch.sv
@@ -18,7 +18,8 @@ module snitch
   parameter logic [31:0] MTVEC     = BootAddr, // Exception Base Address (see privileged spec 3.1.7)
   parameter bit          RVE       = 0,   // Reduced-register Extension
   parameter bit          RVM       = 1,   // Enable IntegerMmultiplication & Division Extension
-  parameter int    RegNrWritePorts = 2    // Implement one or two write ports into the register file
+  parameter int    RegNrWritePorts = 2,   // Implement one or two write ports into the register file
+  parameter bit          Xqueue    = 0
 ) (
   input  logic          clk_i,
   input  logic          rst_i,
@@ -152,7 +153,10 @@ module snitch
     AMOMin  = 4'h8,
     AMOMinu = 4'h9,
     AMOLR   = 4'hA,
-    AMOSC   = 4'hB
+    AMOSC   = 4'hB,
+    // TODO(smazzola): parametrize
+    QPush   = 4'hC, // Only used when Xqueue is enabled
+    QPop    = 4'hD  // Only used when Xqueue is enabled
   } ls_amo;
 
   logic [31:0] ld_result;
@@ -1324,6 +1328,41 @@ module snitch
       end
 /* end of Xpulpimg extension */
 
+/* Xqueues extension */
+      // TODO(khovg): Add define to include instr
+      riscv_instr::Q_PUSH: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPush;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      // TODO(khovg): Two source registers are unnecessary
+      riscv_instr::Q_POP: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPop;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+/* end of Xqueues extension */
+
       // TODO(zarubaf): Illegal Instructions
       default: begin
         illegal_inst = 1'b1;
diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv
index 096156608..3c86b19d4 100644
--- a/hardware/src/mempool_cc.sv
+++ b/hardware/src/mempool_cc.sv
@@ -57,10 +57,11 @@ module mempool_cc
 
   // Snitch Integer Core
   snitch #(
-    .BootAddr ( BootAddr ),
-    .MTVEC    ( MTVEC    ),
-    .RVE      ( RVE      ),
-    .RVM      ( RVM      )
+    .BootAddr ( BootAddr            ),
+    .MTVEC    ( MTVEC               ),
+    .RVE      ( RVE                 ),
+    .RVM      ( RVM                 ),
+    .Xqueue   ( mempool_pkg::Xqueue )
   ) i_snitch (
     .clk_i                                   ,
     .rst_i                                   ,

From 1b6ea2b3b1fcfacdda7f46850f45c93692c0268c Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Wed, 19 May 2021 00:58:44 +0200
Subject: [PATCH 05/24] [hardware] Fix response acquisition

---
 hardware/src/tcdm_adapter_xqueue.sv | 39 ++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index 07426d57f..d89c7a9ee 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -83,6 +83,11 @@ module tcdm_adapter_xqueue #(
   logic resp_vld;
   logic sresp_vld;
 
+  // Helper signals to determine response data acquisition
+  logic mem_read_req;
+  logic force_rdata_acq;
+  logic prevent_rdata_acq;
+
   // FSM related signals
   state_e state_q, state_d;
   logic   vld_amo_op;
@@ -167,6 +172,11 @@ module tcdm_adapter_xqueue #(
   assign resp_in_data  = out_rdata_i;
   assign rdata_out_rdy = resp_accepted;
 
+  // Set if memory read request occurs this cycle
+  assign mem_read_req = out_req_o & !out_write_o;
+  // Acquire response data a cycle after a memory read request (can be forced or prevented)
+  assign rdata_in_vld_d = force_rdata_acq | (mem_read_req & !prevent_rdata_acq);
+
   // Output response valid if both meta and read data are available (the read data will always be last)
   assign resp_vld   = meta_out_vld  & rdata_out_vld;
   assign sresp_vld  = smeta_out_vld & rdata_out_vld;
@@ -205,8 +215,9 @@ module tcdm_adapter_xqueue #(
     // Response data as feed-through of read data
     // resp_in_data   = out_rdata_i;
 
-    // Response is acquired a cycle after a read access
-    rdata_in_vld_d = out_req_o & !out_write_o;
+    // Flags to force or prevent response acquisition
+    force_rdata_acq   = 1'b0;
+    prevent_rdata_acq = 1'b0;
 
     // Flags to increment queue counters
     increment_tail = 1'b0;
@@ -248,16 +259,16 @@ module tcdm_adapter_xqueue #(
           if (amo_op_t'(in_amo_i) == QPush) begin
             if (queue_full) begin
               // Set flag and store queue push data for later
-              queue_stalled_d = 1'b1;
-              qpush_data_d    = in_wdata_i; // TODO: MIGHT NOT BE NEEDED
+              queue_stalled_d   = 1'b1;
+              qpush_data_d      = in_wdata_i; // TODO: MIGHT NOT BE NEEDED
               // Prevent acquisition of response data (TODO: might not be needed)
-              rdata_in_vld_d  = 1'b0;
+              prevent_rdata_acq = 1'b1;
             end else begin
               // Set increment flag
-              increment_tail = 1'b1;
+              increment_tail  = 1'b1;
               // Force acquisition of response data despite a write access
               // Response data will match the write data of the write access
-              rdata_in_vld_d = 1'b1;
+              force_rdata_acq = 1'b1;
               // Previous queue pop failed due to empty queue
               if (queue_stalled_q) begin
                 queue_stalled_d = 1'b0;
@@ -270,9 +281,9 @@ module tcdm_adapter_xqueue #(
           if (amo_op_t'(in_amo_i) == QPop) begin
             if (queue_empty) begin
               // Set flag
-              queue_stalled_d = 1'b1;
+              queue_stalled_d   = 1'b1;
               // Prevent acquisition of response data despite read access
-              rdata_in_vld_d  = 1'b0;
+              prevent_rdata_acq = 1'b1;
             end else begin
               // Set increment flag
               increment_head = 1'b1;
@@ -318,16 +329,16 @@ module tcdm_adapter_xqueue #(
         // Wait until pop response accepted
         if (resp_accepted) begin
           // Set success flag
-          increment_tail = 1'b1;
+          increment_tail  = 1'b1;
           // Trigger memory access
-          out_req_o      = 1'b1;
+          out_req_o       = 1'b1;
           // Force acquisition of response data despite a write access
           // Response data will match the write data of the write access
-          rdata_in_vld_d = 1'b1;
+          force_rdata_acq = 1'b1;
           // Set meta data selection to stalled meta data
-          sresp_select_d = 1'b1;
+          sresp_select_d  = 1'b1;
           // Return to Idle
-          state_d        = Idle;
+          state_d         = Idle;
         end
       end
 

From 6224f5f19ec15469d92d37cbe3377355bdda97da Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Wed, 19 May 2021 01:44:38 +0200
Subject: [PATCH 06/24] [hardware] Remove qpush data registers by abusing
 buffer slot

---
 hardware/src/tcdm_adapter_xqueue.sv | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index d89c7a9ee..cb454d368 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -117,9 +117,6 @@ module tcdm_adapter_xqueue #(
   logic increment_tail, increment_head;
   logic stalled_queue_op;
 
-  // Temporary storage of write data for stalled queue push
-  logic[DataWidth-1:0] qpush_data_d, qpush_data_q;
-
   // Stores the metadata at handshake (except stalled queue operations)
   spill_register #(
     .T     (metadata_t),
@@ -200,7 +197,6 @@ module tcdm_adapter_xqueue #(
     state_d         = state_q;
     sresp_select_d  = sresp_select_q;
     queue_stalled_d = queue_stalled_q;
-    qpush_data_d    = qpush_data_q;
 
     // While response is pending no requests are accepted
     in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1;
@@ -258,10 +254,10 @@ module tcdm_adapter_xqueue #(
           // Queue push
           if (amo_op_t'(in_amo_i) == QPush) begin
             if (queue_full) begin
-              // Set flag and store queue push data for later
+              // Note: Memory write is still executed but the tail is not incremented
+              // Set stalled flag
               queue_stalled_d   = 1'b1;
-              qpush_data_d      = in_wdata_i; // TODO: MIGHT NOT BE NEEDED
-              // Prevent acquisition of response data (TODO: might not be needed)
+              // Prevent acquisition of response data
               prevent_rdata_acq = 1'b1;
             end else begin
               // Set increment flag
@@ -280,7 +276,7 @@ module tcdm_adapter_xqueue #(
           // Queue pop
           if (amo_op_t'(in_amo_i) == QPop) begin
             if (queue_empty) begin
-              // Set flag
+              // Set stalled flag
               queue_stalled_d   = 1'b1;
               // Prevent acquisition of response data despite read access
               prevent_rdata_acq = 1'b1;
@@ -316,19 +312,18 @@ module tcdm_adapter_xqueue #(
       end
 
       // ResolveQPushStall State blocks any requests until queue pop response
-      // has been accepted and then executes the queue push
+      // has been accepted and then prepares the queue push response
+      // (queue push stores data even in full queue but does not update tail)
       ResolveQPushStall: begin
         // Do not accept any requests during resolve
         in_ready_o  = 1'b0;
-        // Prepare queue push (write data at tail of queue)
-        // TODO: INSTEAD READ STORED DATA FOR PUSH RESPONSE
+        // Retrieve queue push data as dummy response (read data at tail of queue)
         out_add_o   = curr_tail_q;
-        out_write_o = 1'b1;
-        out_wdata_o = qpush_data_q;
+        out_write_o = 1'b0;
         out_be_o    = 4'b1111;
         // Wait until pop response accepted
         if (resp_accepted) begin
-          // Set success flag
+          // Set increment flag
           increment_tail  = 1'b1;
           // Trigger memory access
           out_req_o       = 1'b1;
@@ -353,7 +348,7 @@ module tcdm_adapter_xqueue #(
         out_be_o    = 4'b1111;
         // Wait until push response accepted
         if (resp_accepted) begin
-          // Set success flag
+          // Set increment flag
           increment_head = 1'b1;
           // Trigger memory access
           out_req_o      = 1'b1;
@@ -467,7 +462,6 @@ module tcdm_adapter_xqueue #(
       next_tail_q     <= 1;
       curr_head_q     <= 0;
       queue_stalled_q <= 1'b0;
-      qpush_data_q    <= '0;
     end else begin
       state_q         <= state_d;
       amo_op_q        <= amo_op_d;
@@ -479,7 +473,6 @@ module tcdm_adapter_xqueue #(
       next_tail_q     <= next_tail_d;
       curr_head_q     <= curr_head_d;
       queue_stalled_q <= queue_stalled_d;
-      qpush_data_q    <= qpush_data_d;
     end
   end
 

From 03d622e3a5928d8d77acdfe1c4c72cd68694f094 Mon Sep 17 00:00:00 2001
From: Samuel Riedel <sriedel@iis.ee.ethz.ch>
Date: Wed, 23 Mar 2022 22:54:06 +0100
Subject: [PATCH 07/24] [apps/hardware] Implement xqueue_test app

---
 hardware/src/tcdm_adapter_xqueue.sv       |   6 ++
 software/apps/systolic/xqueue_test/main.c | 112 ++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 software/apps/systolic/xqueue_test/main.c

diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index cb454d368..4adb3f415 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -502,6 +502,12 @@ module tcdm_adapter_xqueue #(
       @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy))
       else $fatal (1, "Trying to push new data although the i_rdata_register is not ready.");
   `endif
+
+  `ifndef VERILATOR
+    stalled_queue : assert property(
+      @(posedge clk_i) disable iff (~rst_ni) (!(queue_stalled_q && smeta_in_vld)))
+      else $fatal (1, "Trying to stall a queue operation despite an already stalled queue.");
+  `endif
   // pragma translate_on
 
 endmodule
diff --git a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c
new file mode 100644
index 000000000..4cd39ca5c
--- /dev/null
+++ b/software/apps/systolic/xqueue_test/main.c
@@ -0,0 +1,112 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+int32_t *queue = 0;
+
+int32_t producer_check, consumer_check, dummy_check;
+
+// queue push
+static inline int32_t queue_push(void *const queue, int32_t data) {
+  int32_t ret;
+  asm volatile ("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue));
+  return ret;
+}
+
+// queue pop: issues q.pop.w on `queue` and returns the word it yields
+static inline int32_t queue_pop(void *const queue) {
+  int32_t ret;
+  asm volatile ("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue));
+  return ret;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  extern int32_t __seq_start;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("Initialize\n");
+    queue = &__seq_start;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Producer
+  if (core_id == 0) {
+    int32_t data[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    int32_t check = 0;
+    int32_t resp;
+    int32_t dummy = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+    }
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+      check += data[i];
+    }
+    producer_check = check;
+    dummy_check = dummy;
+  }
+
+  // Consumer
+  if (core_id == 1) {
+    int32_t read_data;
+    int32_t check = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      printf("Rx: %d\n", read_data);
+    }
+    printf("Burst Test\n");
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      check += read_data;
+    }
+    consumer_check = check;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Print both checks
+  if (core_id == 0) {
+    printf("Check: %d/%d/%d\n", producer_check, consumer_check, dummy_check);
+  }
+
+  // wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}

From 20248cbf627e5b1000cc1ef26f9b605a47ec2e82 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Wed, 19 May 2021 17:24:30 +0200
Subject: [PATCH 08/24] [apps] Implement systolic matmul_xqueue (1x1 matmul)

---
 software/apps/systolic/matmul_xqueue/main.c | 210 ++++++++++
 software/runtime/systolic/matmul_xqueue.h   | 401 ++++++++++++++++++++
 2 files changed, 611 insertions(+)
 create mode 100644 software/apps/systolic/matmul_xqueue/main.c
 create mode 100644 software/runtime/systolic/matmul_xqueue.h

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
new file mode 100644
index 000000000..6123c8764
--- /dev/null
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -0,0 +1,210 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "systolic/matmul_xqueue.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+// Dimensions of matrices
+#define DIM_M 12
+#define DIM_N 12
+#define DIM_P 12
+
+uint32_t *grid_mapping;
+
+int32_t *matrix_A;
+int32_t *matrix_B;
+
+uint32_t rep_count;
+
+systolic_matrix_t *syst_matrix_A;
+systolic_matrix_t *syst_matrix_B;
+systolic_matrix_t *syst_matrix_C;
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      new_matrix[y * num_cols + x] = (int32_t)(y + x);
+    }
+  }
+  *matrix = new_matrix;
+}
+
+// Print a row-major num_rows x num_columns matrix, one row per line
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%08X\n", (uint32_t)matrix);
+  for (uint32_t i = 0; i < num_rows; ++i) {
+    for (uint32_t j = 0; j < num_columns; ++j)
+      printf("%5d ", matrix[i * num_columns + j]);
+    printf("\n");
+  }
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate systolic grid mapping
+  if (core_id == 0) {
+    grid_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+  }
+
+  // ----------
+  // 16 CORES
+  // ----------
+
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 4;
+  // uint32_t row_idx = core_id / 4;
+
+  // Assign grid position (col wise)
+  uint32_t col_idx = core_id / 4;
+  uint32_t row_idx = core_id % 4;
+
+  // Assign grid position (tile wise)
+  // uint32_t col_idx;
+  // uint32_t row_idx;
+  // if (core_id < 4) {
+  //   col_idx = core_id % 2;
+  //   row_idx = core_id / 2;
+  // } else if (core_id < 8) {
+  //   col_idx = core_id % 2 + 2;
+  //   row_idx = core_id / 6;
+  // } else if (core_id < 12) {
+  //   col_idx = core_id % 2;
+  //   row_idx = core_id / 10 + 2;
+  // } else {
+  //   col_idx = core_id % 2 + 2;
+  //   row_idx = core_id / 14 + 2;
+  // }
+
+  // uint32_t mapped_tile = tile_id;
+
+  // ----------
+  // 256 CORES
+  // ----------
+
+  // Assign grid position (col wise)
+  // uint32_t col_idx = core_id / 16;
+  // uint32_t row_idx = core_id % 16;
+
+  // Assign grid position (tile wise)
+  // uint32_t mapped_group = core_id % 4;
+  // uint32_t col_idx = tile_id / 4;
+  // uint32_t row_idx = (tile_id % 4) + (mapped_group * 4);
+  // uint32_t mapped_tile = (tile_id % 16) + (mapped_group * 16);
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set systolic grid mapping
+  grid_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id;
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out grid mapping
+    // print_matrix((int32_t *)grid_mapping, 4, 4);
+
+    // Initialize systolic array
+    systolic_init(grid_mapping);
+
+    // Create systolic matrices
+    generate_gradient_matrix(&matrix_A, DIM_M, DIM_N);
+    systolic_matrix_create(&syst_matrix_A, matrix_A, DIM_M, DIM_N);
+    simple_free(matrix_A);
+    generate_gradient_matrix(&matrix_B, DIM_N, DIM_P);
+    systolic_matrix_create(&syst_matrix_B, matrix_B, DIM_N, DIM_P);
+    simple_free(matrix_B);
+    systolic_matrix_allocate(&syst_matrix_C, DIM_M, DIM_P);
+
+    // Print out systolic matrices A & B
+    // printf("> Print Systolic Matrices A & B\n");
+    // systolic_matrix_print(syst_matrix_A);
+    // systolic_matrix_print(syst_matrix_B);
+
+    // Set repetition count per submatrix of C (A->num_cols == B->num_rows)
+    rep_count = syst_matrix_A->num_cols;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    mempool_start_benchmark();
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if ((row_idx == 0) && (col_idx == 0)) {
+    systolic_rcp_pe(rep_count, syst_matrix_A, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx == 0) && (col_idx != 0)) {
+    systolic_cp_pe(col_idx, rep_count, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx == 0)) {
+    systolic_rp_pe(row_idx, rep_count, syst_matrix_A, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx != 0)) {
+    systolic_np_pe(row_idx, col_idx, rep_count, syst_matrix_C);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out systolic matrix C
+    // printf("> Print Systolic Matrix C\n");
+    // systolic_matrix_print(syst_matrix_C);
+  }
+
+  // wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
new file mode 100644
index 000000000..4b923db11
--- /dev/null
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -0,0 +1,401 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
+ * C = AB
+ * (max dimension is 16-bit)
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Dimensions of square systolic array
+#define SYSTOLIC_SIZE 4
+
+// Systolic matrix
+typedef struct {
+  int32_t *matrix;
+  uint32_t num_rows;
+  uint32_t num_cols;
+} systolic_matrix_t;
+
+// TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE
+
+// Array of queue ptrs in row-major order
+int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+
+// TODO: GENERALIZE FOR ANY NUMBER OF TILES
+void systolic_init(uint32_t const *grid_mapping) {
+  // Create systolic array via queues
+  extern int32_t __seq_start;
+  uint32_t grid_pos = 0;
+  uint32_t tile_id;
+  uint32_t tile_offset;
+  uint32_t bank_sel[4] = {0, 0, 0, 0};
+  for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+    for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+      tile_id = grid_mapping[grid_pos];
+      tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
+      queues_vert[y][x] = &__seq_start + tile_offset + bank_sel[tile_id];
+      queues_horz[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1;
+      bank_sel[tile_id] += 2;
+      ++grid_pos;
+    }
+  }
+  // TODO: PRINT OUT THE ADDRESSES TO CHECK
+}
+
+// Allocate a num_rows x num_cols matrix (elements not written) in *syst_matrix
+void systolic_matrix_allocate(systolic_matrix_t **syst_matrix,
+                              uint32_t num_rows, uint32_t num_cols) {
+  // Allocate matrix array
+  int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+
+  // Allocate systolic matrix descriptor, sized from its type
+  systolic_matrix_t *new_matrix =
+      (systolic_matrix_t *)simple_malloc(sizeof(systolic_matrix_t));
+  // Assign values to systolic matrix
+  new_matrix->matrix = array;
+  new_matrix->num_rows = num_rows;
+  new_matrix->num_cols = num_cols;
+  *syst_matrix = new_matrix;
+}
+
+// Create a systolic matrix in *syst_matrix from a copy of row-major `matrix`
+void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix,
+                            uint32_t num_rows, uint32_t num_cols) {
+  // Allocate matrix array
+  int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+
+  // Copy data into new matrix array
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      array[y * num_cols + x] = matrix[y * num_cols + x];
+    }
+  }
+
+  // Allocate systolic matrix descriptor, sized from its type
+  systolic_matrix_t *new_matrix =
+      (systolic_matrix_t *)simple_malloc(sizeof(systolic_matrix_t));
+  // Assign values to systolic matrix
+  new_matrix->matrix = array;
+  new_matrix->num_rows = num_rows;
+  new_matrix->num_cols = num_cols;
+  *syst_matrix = new_matrix;
+}
+
+void systolic_matrix_print(systolic_matrix_t *syst_matrix) {
+  printf("Systolic matrix at 0x%08X\n", (uint32_t)syst_matrix);
+  uint32_t num_rows = syst_matrix->num_rows;
+  uint32_t num_cols = syst_matrix->num_cols;
+  int32_t *matrix = syst_matrix->matrix;
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      printf("%5d ", matrix[y * num_cols + x]);
+    }
+    printf("\n");
+  }
+}
+
+// row and column producing processing element
+void systolic_rcp_pe(const uint32_t rep_count,
+                     systolic_matrix_t const *__restrict__ A,
+                     systolic_matrix_t const *__restrict__ B,
+                     systolic_matrix_t const *__restrict__ C) {
+  int32_t *q_next_horz;
+  int32_t *q_next_vert;
+  int32_t data_horz = 0;
+  int32_t data_vert = 0;
+  int32_t *matrix_A;
+  int32_t *matrix_B;
+  int32_t *matrix_C;
+  uint32_t num_cols_A;
+  uint32_t num_cols_B;
+  uint32_t num_rows_C;
+  uint32_t num_cols_C;
+  int32_t curr_element_C;
+
+  // Assign queues
+  q_next_horz = queues_horz[0][1];
+  q_next_vert = queues_vert[1][0];
+
+  // Get matrix arrays
+  matrix_A = A->matrix;
+  matrix_B = B->matrix;
+  matrix_C = C->matrix;
+
+  // Get dimensions of matrices
+  num_cols_A = A->num_cols;
+  num_cols_B = B->num_cols;
+  num_rows_C = C->num_rows;
+  num_cols_C = C->num_cols;
+
+  // Execute step-wise matrix multiplication
+  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+      // Reset value
+      curr_element_C = 0;
+
+      // Systolic matrix multiplication through MACs
+      for (uint32_t i = 0; i < rep_count; ++i) {
+        data_horz = matrix_A[y * num_cols_A + i];
+        data_vert = matrix_B[i * num_cols_B + x];
+        __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+        curr_element_C += data_horz * data_vert;
+      }
+
+      // Store value
+      matrix_C[y * num_cols_C + x] = curr_element_C;
+    }
+  }
+}
+
+// column producing processing element
+void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
+                    systolic_matrix_t const *__restrict__ B,
+                    systolic_matrix_t const *__restrict__ C) {
+  int32_t *q_prev_horz;
+  int32_t *q_next_horz;
+  int32_t *q_next_vert;
+  int32_t data_horz = 0;
+  int32_t data_vert = 0;
+  int32_t *matrix_B;
+  int32_t *matrix_C;
+  uint32_t num_cols_B;
+  uint32_t num_rows_C;
+  uint32_t num_cols_C;
+  uint32_t shifted_x;
+  int32_t curr_element_C;
+
+  // Assign queues
+  q_prev_horz = queues_horz[0][col_idx];
+  if (col_idx == SYSTOLIC_SIZE - 1) {
+    q_next_horz = NULL;
+  } else {
+    q_next_horz = queues_horz[0][col_idx + 1];
+  }
+  q_next_vert = queues_vert[1][col_idx];
+
+  // Get matrix arrays
+  matrix_B = B->matrix;
+  matrix_C = C->matrix;
+
+  // Get dimensions of matrices
+  num_cols_B = B->num_cols;
+  num_rows_C = C->num_rows;
+  num_cols_C = C->num_cols;
+
+  // Execute step-wise matrix multiplication
+  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+      // Shift x
+      shifted_x = x + col_idx;
+
+      // Check if this PE is currently within the matrix C
+      if (shifted_x < num_cols_C) {
+        // Reset value
+        curr_element_C = 0;
+
+        // Systolic matrix multiplication through MACs
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_vert = matrix_B[i * num_cols_B + shifted_x];
+          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          if (q_next_horz) {
+            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          }
+          __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          curr_element_C += data_horz * data_vert;
+        }
+
+        // Store value
+        matrix_C[y * num_cols_C + shifted_x] = curr_element_C;
+      } else {
+        // Pop and push dummy data
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          if (q_next_horz) {
+            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          }
+          __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+        }
+      }
+    }
+  }
+}
+
+// row producing processing element
+void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
+                    systolic_matrix_t const *__restrict__ A,
+                    systolic_matrix_t const *__restrict__ C) {
+  int32_t *q_next_horz;
+  int32_t *q_prev_vert;
+  int32_t *q_next_vert;
+  int32_t data_horz = 0;
+  int32_t data_vert = 0;
+  int32_t *matrix_A;
+  int32_t *matrix_C;
+  uint32_t num_cols_A;
+  uint32_t num_rows_C;
+  uint32_t num_cols_C;
+  uint32_t shifted_y;
+  int32_t curr_element_C;
+
+  // Assign queues
+  q_next_horz = queues_horz[row_idx][1];
+  q_prev_vert = queues_vert[row_idx][0];
+  if (row_idx == SYSTOLIC_SIZE - 1) {
+    q_next_vert = NULL;
+  } else {
+    q_next_vert = queues_vert[row_idx + 1][0];
+  }
+
+  // Get matrix arrays
+  matrix_A = A->matrix;
+  matrix_C = C->matrix;
+
+  // Get dimensions of matrices
+  num_cols_A = A->num_cols;
+  num_rows_C = C->num_rows;
+  num_cols_C = C->num_cols;
+
+  // Execute step-wise matrix multiplication
+  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+      // Shift y
+      shifted_y = y + row_idx;
+
+      // Check if this PE is currently within the matrix C
+      if (shifted_y < num_rows_C) {
+        // Reset value
+        curr_element_C = 0;
+
+        // Systolic matrix multiplication through MACs
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_horz = matrix_A[shifted_y * num_cols_A + i];
+          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          if (q_next_vert) {
+            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          }
+          curr_element_C += data_horz * data_vert;
+        }
+
+        // Store value
+        matrix_C[shifted_y * num_cols_C + x] = curr_element_C;
+      } else {
+        // Pop and push dummy data
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          if (q_next_vert) {
+            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          }
+        }
+      }
+    }
+  }
+}
+
+// non-producing processing element
+void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
+                    const uint32_t rep_count,
+                    systolic_matrix_t const *__restrict__ C) {
+  int32_t *q_prev_horz;
+  int32_t *q_next_horz;
+  int32_t *q_prev_vert;
+  int32_t *q_next_vert;
+  int32_t data_horz = 0;
+  int32_t data_vert = 0;
+  int32_t *matrix_C;
+  uint32_t num_rows_C;
+  uint32_t num_cols_C;
+  uint32_t shifted_x;
+  uint32_t shifted_y;
+  int32_t curr_element_C;
+
+  // Assign queues
+  q_prev_horz = queues_horz[row_idx][col_idx];
+  if (col_idx == SYSTOLIC_SIZE - 1) {
+    q_next_horz = NULL;
+  } else {
+    q_next_horz = queues_horz[row_idx][col_idx + 1];
+  }
+  q_prev_vert = queues_vert[row_idx][col_idx];
+  if (row_idx == SYSTOLIC_SIZE - 1) {
+    q_next_vert = NULL;
+  } else {
+    q_next_vert = queues_vert[row_idx + 1][col_idx];
+  }
+
+  // Get matrix arrays
+  matrix_C = C->matrix;
+
+  // Get dimensions of matrices
+  num_rows_C = C->num_rows;
+  num_cols_C = C->num_cols;
+
+  // Execute step-wise matrix multiplication
+  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+      // Shift x and y
+      shifted_x = x + col_idx;
+      shifted_y = y + row_idx;
+
+      // Check if this PE is currently within the matrix C
+      if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
+        // Reset value
+        curr_element_C = 0;
+
+        // Systolic matrix multiplication through MACs
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          if (q_next_horz) {
+            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          }
+          if (q_next_vert) {
+            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          }
+          curr_element_C += data_horz * data_vert;
+        }
+
+        // Store values
+        matrix_C[shifted_y * num_cols_C + shifted_x] = curr_element_C;
+      } else {
+        // Pop and push dummy data
+        for (uint32_t i = 0; i < rep_count; ++i) {
+          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          if (q_next_horz) {
+            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+
+          }
+          if (q_next_vert) {
+            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          }
+        }
+      }
+    }
+  }
+}

From ac43b0c35300e0d279fff08fdc77604674732ceb Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Wed, 19 May 2021 17:24:51 +0200
Subject: [PATCH 09/24] [apps] Optimize systolic matmul_xqueue for 2x2 matmul

---
 software/apps/systolic/matmul_xqueue/main.c |   6 +-
 software/runtime/systolic/matmul_xqueue.h   | 337 +++++++++++++++-----
 2 files changed, 260 insertions(+), 83 deletions(-)

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index 6123c8764..99ac34f80 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -159,7 +159,7 @@ int main() {
     // systolic_matrix_print(syst_matrix_B);
 
     // Set repetition count per submatrix of C (A->num_cols == B->num_rows)
-    rep_count = syst_matrix_A->num_cols;
+    rep_count = syst_matrix_A->num_cols / 2;
   }
 
   // Wait for all cores
@@ -200,8 +200,8 @@ int main() {
     printf("> End\n");
 
     // Print out systolic matrix C
-    // printf("> Print Systolic Matrix C\n");
-    // systolic_matrix_print(syst_matrix_C);
+    printf("> Print Systolic Matrix C\n");
+    systolic_matrix_print(syst_matrix_C);
   }
 
   // wait until all cores have finished
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index 4b923db11..01ddfc9fa 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -67,29 +67,49 @@ void systolic_init(uint32_t const *grid_mapping) {
 
 void systolic_matrix_allocate(systolic_matrix_t **syst_matrix,
                               uint32_t num_rows, uint32_t num_cols) {
+  // Round up row and col dimension to next multiple of two
+  uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE);
+  uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE);
+
   // Allocate matrix array
-  int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4);
 
   // Allocate systolic matrix
   systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4);
 
   // Assign values to systolic matrix
   new_matrix->matrix = array;
-  new_matrix->num_rows = num_rows;
-  new_matrix->num_cols = num_cols;
+  new_matrix->num_rows = syst_num_rows;
+  new_matrix->num_cols = syst_num_cols;
 
   *syst_matrix = new_matrix;
 }
 
 void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix,
                             uint32_t num_rows, uint32_t num_cols) {
+  // Round up row and col dimension to next multiple of two
+  uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE);
+  uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE);
+
   // Allocate matrix array
-  int32_t *array = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4);
 
   // Copy data into new matrix array
   for (uint32_t y = 0; y < num_rows; ++y) {
     for (uint32_t x = 0; x < num_cols; ++x) {
-      array[y * num_cols + x] = matrix[y * num_cols + x];
+      array[y * syst_num_cols + x] = matrix[y * num_cols + x];
+    }
+  }
+
+  // Zero padding of matrix array
+  if (syst_num_cols != num_cols) {
+    for (uint32_t y = 0; y < syst_num_rows; ++y) {
+      array[y * syst_num_cols + syst_num_cols - 1] = 0;
+    }
+  }
+  if (syst_num_rows != num_rows) {
+    for (uint32_t x = 0; x < syst_num_cols; ++x) {
+      array[(syst_num_rows - 1) * syst_num_cols + x] = 0;
     }
   }
 
@@ -98,8 +118,8 @@ void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix,
 
   // Assign values to systolic matrix
   new_matrix->matrix = array;
-  new_matrix->num_rows = num_rows;
-  new_matrix->num_cols = num_cols;
+  new_matrix->num_rows = syst_num_rows;
+  new_matrix->num_cols = syst_num_cols;
 
   *syst_matrix = new_matrix;
 }
@@ -124,8 +144,8 @@ void systolic_rcp_pe(const uint32_t rep_count,
                      systolic_matrix_t const *__restrict__ C) {
   int32_t *q_next_horz;
   int32_t *q_next_vert;
-  int32_t data_horz = 0;
-  int32_t data_vert = 0;
+  int32_t data_horz[4] = {0, 0, 0, 0};
+  int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t *matrix_A;
   int32_t *matrix_B;
   int32_t *matrix_C;
@@ -133,7 +153,12 @@ void systolic_rcp_pe(const uint32_t rep_count,
   uint32_t num_cols_B;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
-  int32_t curr_element_C;
+  int32_t curr_element_0_C;
+  int32_t curr_element_1_C;
+  int32_t curr_element_2_C;
+  int32_t curr_element_3_C;
+  uint32_t anchor_row_0;
+  uint32_t anchor_row_1;
 
   // Assign queues
   q_next_horz = queues_horz[0][1];
@@ -151,22 +176,49 @@ void systolic_rcp_pe(const uint32_t rep_count,
   num_cols_C = C->num_cols;
 
   // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
-      // Reset value
-      curr_element_C = 0;
+  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+      // Reset values
+      curr_element_0_C = 0;
+      curr_element_1_C = 0;
+      curr_element_2_C = 0;
+      curr_element_3_C = 0;
 
       // Systolic matrix multiplication through MACs
-      for (uint32_t i = 0; i < rep_count; ++i) {
-        data_horz = matrix_A[y * num_cols_A + i];
-        data_vert = matrix_B[i * num_cols_B + x];
-        __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
-        curr_element_C += data_horz * data_vert;
+      for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+        data_horz[0] = matrix_A[y * num_cols_A + i];
+        data_horz[1] = matrix_A[y * num_cols_A + i + 1];
+        data_horz[2] = matrix_A[(y + 1) * num_cols_A + i];
+        data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1];
+        data_vert[0] = matrix_B[i * num_cols_B + x];
+        data_vert[1] = matrix_B[i * num_cols_B + x + 1];
+        data_vert[2] = matrix_B[(i + 1) * num_cols_B + x];
+        data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1];
+        __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+        __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+        curr_element_0_C += data_horz[1] * data_vert[2];
+        curr_element_1_C += data_horz[1] * data_vert[3];
+        curr_element_2_C += data_horz[3] * data_vert[2];
+        curr_element_3_C += data_horz[3] * data_vert[3];
+        curr_element_0_C += data_horz[0] * data_vert[0];
+        curr_element_1_C += data_horz[0] * data_vert[1];
+        curr_element_2_C += data_horz[2] * data_vert[0];
+        curr_element_3_C += data_horz[2] * data_vert[1];
       }
 
-      // Store value
-      matrix_C[y * num_cols_C + x] = curr_element_C;
+      // Store values
+      anchor_row_0 = y * num_cols_C + x;
+      anchor_row_1 = anchor_row_0 + num_cols_C;
+      matrix_C[anchor_row_0] = curr_element_0_C;
+      matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+      matrix_C[anchor_row_1] = curr_element_2_C;
+      matrix_C[anchor_row_1 + 1] = curr_element_3_C;
     }
   }
 }
@@ -178,15 +230,20 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   int32_t *q_prev_horz;
   int32_t *q_next_horz;
   int32_t *q_next_vert;
-  int32_t data_horz = 0;
-  int32_t data_vert = 0;
+  int32_t data_horz[4] = {0, 0, 0, 0};
+  int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t *matrix_B;
   int32_t *matrix_C;
   uint32_t num_cols_B;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
   uint32_t shifted_x;
-  int32_t curr_element_C;
+  int32_t curr_element_0_C;
+  int32_t curr_element_1_C;
+  int32_t curr_element_2_C;
+  int32_t curr_element_3_C;
+  uint32_t anchor_row_0;
+  uint32_t anchor_row_1;
 
   // Assign queues
   q_prev_horz = queues_horz[0][col_idx];
@@ -207,37 +264,73 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   num_cols_C = C->num_cols;
 
   // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
       // Shift x
-      shifted_x = x + col_idx;
+      shifted_x = x + 2 * col_idx;
 
       // Check if this PE is currently within the matrix C
       if (shifted_x < num_cols_C) {
-        // Reset value
-        curr_element_C = 0;
+        // Reset values
+        curr_element_0_C = 0;
+        curr_element_1_C = 0;
+        curr_element_2_C = 0;
+        curr_element_3_C = 0;
 
         // Systolic matrix multiplication through MACs
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_vert = matrix_B[i * num_cols_B + shifted_x];
-          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+        for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+          data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
+          data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1];
+          data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+          data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
+          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
           if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           }
-          __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
-          curr_element_C += data_horz * data_vert;
+          __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+          curr_element_0_C += data_horz[1] * data_vert[2];
+          curr_element_1_C += data_horz[1] * data_vert[3];
+          curr_element_2_C += data_horz[3] * data_vert[2];
+          curr_element_3_C += data_horz[3] * data_vert[3];
+          curr_element_0_C += data_horz[0] * data_vert[0];
+          curr_element_1_C += data_horz[0] * data_vert[1];
+          curr_element_2_C += data_horz[2] * data_vert[0];
+          curr_element_3_C += data_horz[2] * data_vert[1];
         }
 
-        // Store value
-        matrix_C[y * num_cols_C + shifted_x] = curr_element_C;
+        // Store values
+        anchor_row_0 = y * num_cols_C + shifted_x;
+        anchor_row_1 = anchor_row_0 + num_cols_C;
+        matrix_C[anchor_row_0] = curr_element_0_C;
+        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+        matrix_C[anchor_row_1] = curr_element_2_C;
+        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
       } else {
         // Pop and push dummy data
         for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
           if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           }
-          __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
         }
       }
     }
@@ -251,15 +344,20 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   int32_t *q_next_horz;
   int32_t *q_prev_vert;
   int32_t *q_next_vert;
-  int32_t data_horz = 0;
-  int32_t data_vert = 0;
+  int32_t data_horz[4] = {0, 0, 0, 0};
+  int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t *matrix_A;
   int32_t *matrix_C;
   uint32_t num_cols_A;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
   uint32_t shifted_y;
-  int32_t curr_element_C;
+  int32_t curr_element_0_C;
+  int32_t curr_element_1_C;
+  int32_t curr_element_2_C;
+  int32_t curr_element_3_C;
+  uint32_t anchor_row_0;
+  uint32_t anchor_row_1;
 
   // Assign queues
   q_next_horz = queues_horz[row_idx][1];
@@ -280,36 +378,72 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   num_cols_C = C->num_cols;
 
   // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
       // Shift y
-      shifted_y = y + row_idx;
+      shifted_y = y + 2 * row_idx;
 
       // Check if this PE is currently within the matrix C
       if (shifted_y < num_rows_C) {
-        // Reset value
-        curr_element_C = 0;
+        // Reset values
+        curr_element_0_C = 0;
+        curr_element_1_C = 0;
+        curr_element_2_C = 0;
+        curr_element_3_C = 0;
 
         // Systolic matrix multiplication through MACs
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz = matrix_A[shifted_y * num_cols_A + i];
-          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+        for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+          data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
+          data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
+          data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
+          data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
+          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
           }
-          curr_element_C += data_horz * data_vert;
+          curr_element_0_C += data_horz[1] * data_vert[2];
+          curr_element_1_C += data_horz[1] * data_vert[3];
+          curr_element_2_C += data_horz[3] * data_vert[2];
+          curr_element_3_C += data_horz[3] * data_vert[3];
+          curr_element_0_C += data_horz[0] * data_vert[0];
+          curr_element_1_C += data_horz[0] * data_vert[1];
+          curr_element_2_C += data_horz[2] * data_vert[0];
+          curr_element_3_C += data_horz[2] * data_vert[1];
         }
 
-        // Store value
-        matrix_C[shifted_y * num_cols_C + x] = curr_element_C;
+        // Store values
+        anchor_row_0 = shifted_y * num_cols_C + x;
+        anchor_row_1 = anchor_row_0 + num_cols_C;
+        matrix_C[anchor_row_0] = curr_element_0_C;
+        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+        matrix_C[anchor_row_1] = curr_element_2_C;
+        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
       } else {
         // Pop and push dummy data
         for (uint32_t i = 0; i < rep_count; ++i) {
-          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+          __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
           }
         }
       }
@@ -325,14 +459,19 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   int32_t *q_next_horz;
   int32_t *q_prev_vert;
   int32_t *q_next_vert;
-  int32_t data_horz = 0;
-  int32_t data_vert = 0;
+  int32_t data_horz[4] = {0, 0, 0, 0};
+  int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t *matrix_C;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
   uint32_t shifted_x;
   uint32_t shifted_y;
-  int32_t curr_element_C;
+  int32_t curr_element_0_C;
+  int32_t curr_element_1_C;
+  int32_t curr_element_2_C;
+  int32_t curr_element_3_C;
+  uint32_t anchor_row_0;
+  uint32_t anchor_row_1;
 
   // Assign queues
   q_prev_horz = queues_horz[row_idx][col_idx];
@@ -356,43 +495,81 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   num_cols_C = C->num_cols;
 
   // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += SYSTOLIC_SIZE) {
+  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
       // Shift x and y
-      shifted_x = x + col_idx;
-      shifted_y = y + row_idx;
+      shifted_x = x + 2 * col_idx;
+      shifted_y = y + 2 * row_idx;
 
       // Check if this PE is currently within the matrix C
       if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
-        // Reset value
-        curr_element_C = 0;
+        // Reset values
+        curr_element_0_C = 0;
+        curr_element_1_C = 0;
+        curr_element_2_C = 0;
+        curr_element_3_C = 0;
 
         // Systolic matrix multiplication through MACs
         for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
           if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           }
           if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
           }
-          curr_element_C += data_horz * data_vert;
+          curr_element_0_C += data_horz[1] * data_vert[2];
+          curr_element_1_C += data_horz[1] * data_vert[3];
+          curr_element_2_C += data_horz[3] * data_vert[2];
+          curr_element_3_C += data_horz[3] * data_vert[3];
+          curr_element_0_C += data_horz[0] * data_vert[0];
+          curr_element_1_C += data_horz[0] * data_vert[1];
+          curr_element_2_C += data_horz[2] * data_vert[0];
+          curr_element_3_C += data_horz[2] * data_vert[1];
         }
 
         // Store values
-        matrix_C[shifted_y * num_cols_C + shifted_x] = curr_element_C;
+        anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+        anchor_row_1 = anchor_row_0 + num_cols_C;
+        matrix_C[anchor_row_0] = curr_element_0_C;
+        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+        matrix_C[anchor_row_1] = curr_element_2_C;
+        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
       } else {
         // Pop and push dummy data
         for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_vert = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
+          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
+          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
           if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz, __ATOMIC_SEQ_CST);
-
+            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
           }
           if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert, __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
+            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
           }
         }
       }

From 51b6eb5b9609d5f3c519e32fba40d9e67ffc3693 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Thu, 20 May 2021 20:39:32 +0200
Subject: [PATCH 10/24] [apps] Improve matmul_xqueue code - add interleaving
 and fix data dependency - hotfix to enforce data dependency in dummy pop/push

---
 software/apps/systolic/matmul_xqueue/main.c |   4 +-
 software/runtime/systolic/matmul_xqueue.h   | 845 ++++++++++++++------
 2 files changed, 603 insertions(+), 246 deletions(-)

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index 99ac34f80..da46fbe35 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -200,8 +200,8 @@ int main() {
     printf("> End\n");
 
     // Print out systolic matrix C
-    printf("> Print Systolic Matrix C\n");
-    systolic_matrix_print(syst_matrix_C);
+    //printf("> Print Systolic Matrix C\n");
+    //systolic_matrix_print(syst_matrix_C);
   }
 
   // wait until all cores have finished
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index 01ddfc9fa..113091293 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -23,6 +23,15 @@
 /* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
  * C = AB
  * (max dimension is 16-bit)
+ * Matrix is processed in 2x2 submatrices with the following indexing
+ *
+ *        B B          0 2
+ *        B B          1 3
+ *
+ *   A A  C C  =  0 1  0 1
+ *   A A  C C     2 3  2 3
+ *
+ * e.g. C0 = A1 * B1 + A0 * B0
  */
 
 #include "alloc.h"
@@ -44,6 +53,16 @@ typedef struct {
 int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 
+// queue push (relaxed atomic fetch-and on the queue address)
+static inline int32_t queue_push(int32_t *queue, int32_t data) {
+  return __atomic_fetch_and(queue, data, __ATOMIC_RELAXED);
+}
+
+// queue pop (relaxed atomic fetch-or with 0 on the queue address)
+static inline int32_t queue_pop(int32_t *queue) {
+  return __atomic_fetch_or(queue, 0, __ATOMIC_RELAXED);
+}
+
 // TODO: GENERALIZE FOR ANY NUMBER OF TILES
 void systolic_init(uint32_t const *grid_mapping) {
   // Create systolic array via queues
@@ -142,10 +161,12 @@ void systolic_rcp_pe(const uint32_t rep_count,
                      systolic_matrix_t const *__restrict__ A,
                      systolic_matrix_t const *__restrict__ B,
                      systolic_matrix_t const *__restrict__ C) {
-  int32_t *q_next_horz;
-  int32_t *q_next_vert;
+  int32_t *queue_next_horz;
+  int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
+  int32_t resp_horz __attribute__((unused));
+  int32_t resp_vert __attribute__((unused));
   int32_t *matrix_A;
   int32_t *matrix_B;
   int32_t *matrix_C;
@@ -161,8 +182,8 @@ void systolic_rcp_pe(const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  q_next_horz = queues_horz[0][1];
-  q_next_vert = queues_vert[1][0];
+  queue_next_horz = queues_horz[0][1];
+  queue_next_vert = queues_vert[1][0];
 
   // Get matrix arrays
   matrix_A = A->matrix;
@@ -187,29 +208,29 @@ void systolic_rcp_pe(const uint32_t rep_count,
       // Systolic matrix multiplication through MACs
       for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
         data_horz[0] = matrix_A[y * num_cols_A + i];
+        data_vert[0] = matrix_B[i * num_cols_B + x];
+        resp_horz = queue_push(queue_next_horz, data_horz[0]);
+        resp_vert = queue_push(queue_next_vert, data_vert[0]);
+        curr_element_0_C += data_horz[0] * data_vert[0];
         data_horz[1] = matrix_A[y * num_cols_A + i + 1];
+        data_vert[1] = matrix_B[(i + 1) * num_cols_B + x];
+        resp_horz = queue_push(queue_next_horz, data_horz[1]);
+        resp_vert = queue_push(queue_next_vert, data_vert[1]);
+        curr_element_0_C += data_horz[1] * data_vert[1];
         data_horz[2] = matrix_A[(y + 1) * num_cols_A + i];
+        data_vert[2] = matrix_B[i * num_cols_B + x + 1];
+        resp_horz = queue_push(queue_next_horz, data_horz[2]);
+        resp_vert = queue_push(queue_next_vert, data_vert[2]);
+        curr_element_1_C += data_horz[0] * data_vert[2];
+        curr_element_2_C += data_horz[2] * data_vert[0];
+        curr_element_3_C += data_horz[2] * data_vert[2];
         data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1];
-        data_vert[0] = matrix_B[i * num_cols_B + x];
-        data_vert[1] = matrix_B[i * num_cols_B + x + 1];
-        data_vert[2] = matrix_B[(i + 1) * num_cols_B + x];
         data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1];
-        __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-        __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
-        curr_element_0_C += data_horz[1] * data_vert[2];
+        resp_horz = queue_push(queue_next_horz, data_horz[3]);
+        resp_vert = queue_push(queue_next_vert, data_vert[3]);
         curr_element_1_C += data_horz[1] * data_vert[3];
-        curr_element_2_C += data_horz[3] * data_vert[2];
+        curr_element_2_C += data_horz[3] * data_vert[1];
         curr_element_3_C += data_horz[3] * data_vert[3];
-        curr_element_0_C += data_horz[0] * data_vert[0];
-        curr_element_1_C += data_horz[0] * data_vert[1];
-        curr_element_2_C += data_horz[2] * data_vert[0];
-        curr_element_3_C += data_horz[2] * data_vert[1];
       }
 
       // Store values
@@ -227,11 +248,13 @@ void systolic_rcp_pe(const uint32_t rep_count,
 void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ B,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *q_prev_horz;
-  int32_t *q_next_horz;
-  int32_t *q_next_vert;
+  int32_t *queue_prev_horz;
+  int32_t *queue_next_horz;
+  int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
+  int32_t resp_horz __attribute__((unused));
+  int32_t resp_vert __attribute__((unused));
   int32_t *matrix_B;
   int32_t *matrix_C;
   uint32_t num_cols_B;
@@ -246,13 +269,13 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  q_prev_horz = queues_horz[0][col_idx];
+  queue_prev_horz = queues_horz[0][col_idx];
   if (col_idx == SYSTOLIC_SIZE - 1) {
-    q_next_horz = NULL;
+    queue_next_horz = NULL;
   } else {
-    q_next_horz = queues_horz[0][col_idx + 1];
+    queue_next_horz = queues_horz[0][col_idx + 1];
   }
-  q_next_vert = queues_vert[1][col_idx];
+  queue_next_vert = queues_vert[1][col_idx];
 
   // Get matrix arrays
   matrix_B = B->matrix;
@@ -263,74 +286,134 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   num_rows_C = C->num_rows;
   num_cols_C = C->num_cols;
 
-  // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
-      // Shift x
-      shifted_x = x + 2 * col_idx;
-
-      // Check if this PE is currently within the matrix C
-      if (shifted_x < num_cols_C) {
-        // Reset values
-        curr_element_0_C = 0;
-        curr_element_1_C = 0;
-        curr_element_2_C = 0;
-        curr_element_3_C = 0;
-
-        // Systolic matrix multiplication through MACs
-        for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
-          data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
-          data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1];
-          data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x];
-          data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
-          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
+  // Check if PE is at the right boundary
+  if (queue_next_horz) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x
+        shifted_x = x + 2 * col_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
+            data_horz[0] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            data_horz[1] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            data_horz[2] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            data_horz[3] = queue_pop(queue_prev_horz);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
           }
-          __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
-          curr_element_0_C += data_horz[1] * data_vert[2];
-          curr_element_1_C += data_horz[1] * data_vert[3];
-          curr_element_2_C += data_horz[3] * data_vert[2];
-          curr_element_3_C += data_horz[3] * data_vert[3];
-          curr_element_0_C += data_horz[0] * data_vert[0];
-          curr_element_1_C += data_horz[0] * data_vert[1];
-          curr_element_2_C += data_horz[2] * data_vert[0];
-          curr_element_3_C += data_horz[2] * data_vert[1];
         }
+      }
+    }
+  } else {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x
+        shifted_x = x + 2 * col_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
+            data_horz[0] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
 
-        // Store values
-        anchor_row_0 = y * num_cols_C + shifted_x;
-        anchor_row_1 = anchor_row_0 + num_cols_C;
-        matrix_C[anchor_row_0] = curr_element_0_C;
-        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
-        matrix_C[anchor_row_1] = curr_element_2_C;
-        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
-      } else {
-        // Pop and push dummy data
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
+          // Store values
+          anchor_row_0 = y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_horz[0]);
+            data_horz[1] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_horz[1]);
+            data_horz[2] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_horz[2]);
+            data_horz[3] = queue_pop(queue_prev_horz);
+            resp_vert = queue_push(queue_next_vert, data_horz[3]);
           }
-          __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
         }
       }
     }
@@ -341,11 +424,13 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
 void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ A,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *q_next_horz;
-  int32_t *q_prev_vert;
-  int32_t *q_next_vert;
+  int32_t *queue_next_horz;
+  int32_t *queue_prev_vert;
+  int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
+  int32_t resp_horz __attribute__((unused));
+  int32_t resp_vert __attribute__((unused));
   int32_t *matrix_A;
   int32_t *matrix_C;
   uint32_t num_cols_A;
@@ -360,12 +445,12 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  q_next_horz = queues_horz[row_idx][1];
-  q_prev_vert = queues_vert[row_idx][0];
+  queue_next_horz = queues_horz[row_idx][1];
+  queue_prev_vert = queues_vert[row_idx][0];
   if (row_idx == SYSTOLIC_SIZE - 1) {
-    q_next_vert = NULL;
+    queue_next_vert = NULL;
   } else {
-    q_next_vert = queues_vert[row_idx + 1][0];
+    queue_next_vert = queues_vert[row_idx + 1][0];
   }
 
   // Get matrix arrays
@@ -377,73 +462,133 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   num_rows_C = C->num_rows;
   num_cols_C = C->num_cols;
 
-  // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
-      // Shift y
-      shifted_y = y + 2 * row_idx;
-
-      // Check if this PE is currently within the matrix C
-      if (shifted_y < num_rows_C) {
-        // Reset values
-        curr_element_0_C = 0;
-        curr_element_1_C = 0;
-        curr_element_2_C = 0;
-        curr_element_3_C = 0;
-
-        // Systolic matrix multiplication through MACs
-        for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
-          data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
-          data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
-          data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
-          data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
-          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
-          if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+  // Check if PE is at the bottom boundary
+  if (queue_next_vert) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift y
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
           }
-          curr_element_0_C += data_horz[1] * data_vert[2];
-          curr_element_1_C += data_horz[1] * data_vert[3];
-          curr_element_2_C += data_horz[3] * data_vert[2];
-          curr_element_3_C += data_horz[3] * data_vert[3];
-          curr_element_0_C += data_horz[0] * data_vert[0];
-          curr_element_1_C += data_horz[0] * data_vert[1];
-          curr_element_2_C += data_horz[2] * data_vert[0];
-          curr_element_3_C += data_horz[2] * data_vert[1];
         }
+      }
+    }
+  } else {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift y
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
 
-        // Store values
-        anchor_row_0 = shifted_y * num_cols_C + x;
-        anchor_row_1 = anchor_row_0 + num_cols_C;
-        matrix_C[anchor_row_0] = curr_element_0_C;
-        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
-        matrix_C[anchor_row_1] = curr_element_2_C;
-        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
-      } else {
-        // Pop and push dummy data
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-          __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
-          if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_vert[0]);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_vert[1]);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_vert[2]);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_vert[3]);
           }
         }
       }
@@ -455,12 +600,15 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
 void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
                     const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *q_prev_horz;
-  int32_t *q_next_horz;
-  int32_t *q_prev_vert;
-  int32_t *q_next_vert;
+  int32_t *queue_prev_horz;
+  int32_t *queue_next_horz;
+  int32_t *queue_prev_vert;
+  int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
+  int32_t data_dummy __attribute__((unused)) = 0;
+  int32_t resp_horz __attribute__((unused));
+  int32_t resp_vert __attribute__((unused));
   int32_t *matrix_C;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
@@ -474,17 +622,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   uint32_t anchor_row_1;
 
   // Assign queues
-  q_prev_horz = queues_horz[row_idx][col_idx];
+  queue_prev_horz = queues_horz[row_idx][col_idx];
   if (col_idx == SYSTOLIC_SIZE - 1) {
-    q_next_horz = NULL;
+    queue_next_horz = NULL;
   } else {
-    q_next_horz = queues_horz[row_idx][col_idx + 1];
+    queue_next_horz = queues_horz[row_idx][col_idx + 1];
   }
-  q_prev_vert = queues_vert[row_idx][col_idx];
+  queue_prev_vert = queues_vert[row_idx][col_idx];
   if (row_idx == SYSTOLIC_SIZE - 1) {
-    q_next_vert = NULL;
+    queue_next_vert = NULL;
   } else {
-    q_next_vert = queues_vert[row_idx + 1][col_idx];
+    queue_next_vert = queues_vert[row_idx + 1][col_idx];
   }
 
   // Get matrix arrays
@@ -494,82 +642,291 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   num_rows_C = C->num_rows;
   num_cols_C = C->num_cols;
 
-  // Execute step-wise matrix multiplication
-  for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
-    for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
-      // Shift x and y
-      shifted_x = x + 2 * col_idx;
-      shifted_y = y + 2 * row_idx;
-
-      // Check if this PE is currently within the matrix C
-      if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
-        // Reset values
-        curr_element_0_C = 0;
-        curr_element_1_C = 0;
-        curr_element_2_C = 0;
-        curr_element_3_C = 0;
-
-        // Systolic matrix multiplication through MACs
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
+  // PE is not at a boundary
+  if (queue_next_horz && queue_next_vert) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x and y
+        shifted_x = x + 2 * col_idx;
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+          }
+        }
+      }
+    }
+  }
+
+  // PE is at the right boundary
+  if (!queue_next_horz && queue_next_vert) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x and y
+        shifted_x = x + 2 * col_idx;
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
           }
-          if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            data_vert[0] += data_horz[0];
+            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            data_vert[1] += data_horz[1];
+            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            data_vert[2] += data_horz[2];
+            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            data_vert[3] += data_horz[3];
+            resp_vert = queue_push(queue_next_vert, data_vert[3]);
           }
-          curr_element_0_C += data_horz[1] * data_vert[2];
-          curr_element_1_C += data_horz[1] * data_vert[3];
-          curr_element_2_C += data_horz[3] * data_vert[2];
-          curr_element_3_C += data_horz[3] * data_vert[3];
-          curr_element_0_C += data_horz[0] * data_vert[0];
-          curr_element_1_C += data_horz[0] * data_vert[1];
-          curr_element_2_C += data_horz[2] * data_vert[0];
-          curr_element_3_C += data_horz[2] * data_vert[1];
         }
+      }
+    }
+  }
 
-        // Store values
-        anchor_row_0 = shifted_y * num_cols_C + shifted_x;
-        anchor_row_1 = anchor_row_0 + num_cols_C;
-        matrix_C[anchor_row_0] = curr_element_0_C;
-        matrix_C[anchor_row_0 + 1] = curr_element_1_C;
-        matrix_C[anchor_row_1] = curr_element_2_C;
-        matrix_C[anchor_row_1 + 1] = curr_element_3_C;
-      } else {
-        // Pop and push dummy data
-        for (uint32_t i = 0; i < rep_count; ++i) {
-          data_horz[0] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[1] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[2] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_horz[3] = __atomic_fetch_or(q_prev_horz, 0, __ATOMIC_SEQ_CST);
-          data_vert[0] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[1] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[2] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          data_vert[3] = __atomic_fetch_or(q_prev_vert, 0, __ATOMIC_SEQ_CST);
-          if (q_next_horz) {
-            __atomic_fetch_and(q_next_horz, data_horz[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_horz, data_horz[3], __ATOMIC_SEQ_CST);
+  // PE is at the bottom boundary
+  if (queue_next_horz && !queue_next_vert) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x and y
+        shifted_x = x + 2 * col_idx;
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
           }
-          if (q_next_vert) {
-            __atomic_fetch_and(q_next_vert, data_vert[0], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[1], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[2], __ATOMIC_SEQ_CST);
-            __atomic_fetch_and(q_next_vert, data_vert[3], __ATOMIC_SEQ_CST);
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            data_horz[0] += data_vert[0];
+            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            data_horz[1] += data_vert[1];
+            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            data_horz[2] += data_vert[2];
+            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            data_horz[3] += data_vert[3];
+            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+          }
+        }
+      }
+    }
+  }
+
+  // PE is at the bottom right corner
+  if (!queue_next_horz && !queue_next_vert) {
+    // Execute step-wise matrix multiplication
+    for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
+      for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
+        // Shift x and y
+        shifted_x = x + 2 * col_idx;
+        shifted_y = y + 2 * row_idx;
+
+        // Check if this PE is currently within the matrix C
+        if (shifted_x < num_cols_C && shifted_y < num_rows_C) {
+          // Reset values
+          curr_element_0_C = 0;
+          curr_element_1_C = 0;
+          curr_element_2_C = 0;
+          curr_element_3_C = 0;
+
+          // Systolic matrix multiplication through MACs
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            curr_element_1_C += data_horz[0] * data_vert[2];
+            curr_element_2_C += data_horz[2] * data_vert[0];
+            curr_element_3_C += data_horz[2] * data_vert[2];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            curr_element_1_C += data_horz[1] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            data_horz[0] = queue_pop(queue_prev_horz);
+            data_vert[0] = queue_pop(queue_prev_vert);
+            data_dummy += data_horz[0] * data_vert[0];
+            data_horz[1] = queue_pop(queue_prev_horz);
+            data_vert[1] = queue_pop(queue_prev_vert);
+            data_dummy += data_horz[1] * data_vert[1];
+            data_horz[2] = queue_pop(queue_prev_horz);
+            data_vert[2] = queue_pop(queue_prev_vert);
+            data_dummy += data_horz[2] * data_vert[2];
+            data_horz[3] = queue_pop(queue_prev_horz);
+            data_vert[3] = queue_pop(queue_prev_vert);
+            data_dummy += data_horz[3] * data_vert[3];
+            // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY
+            if (!data_dummy)
+              break;
           }
         }
       }

From 5f2d0bcaa54129a93a87087b7e553b3b70d5f82f Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Thu, 27 May 2021 20:20:52 +0200
Subject: [PATCH 11/24] [apps] Optimize matmul_xqueue with asm inline

---
 software/apps/systolic/matmul_xqueue/main.c |  18 +-
 software/runtime/systolic/matmul_xqueue.h   | 442 ++++++++++----------
 2 files changed, 242 insertions(+), 218 deletions(-)

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index da46fbe35..fafc4fbea 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -27,9 +27,9 @@
 #include "synchronization.h"
 
 // Dimensions of matrices
-#define DIM_M 12
-#define DIM_N 12
-#define DIM_P 12
+#define DIM_M 16
+#define DIM_N 16
+#define DIM_P 16
 
 uint32_t *grid_mapping;
 
@@ -171,6 +171,10 @@ int main() {
     mempool_start_benchmark();
   }
 
+  // Start benchmark for all cores
+  // mempool_barrier(num_cores);
+  // mempool_start_benchmark();
+
   // Wait for all cores
   mempool_barrier(num_cores);
 
@@ -193,6 +197,10 @@ int main() {
   // Wait for all cores
   mempool_barrier(num_cores);
 
+  // Stop benchmark for all cores
+  // mempool_stop_benchmark();
+  // mempool_barrier(num_cores);
+
   // Print out benchmark
   if (core_id == 0) {
     // Stop benchmark
@@ -200,8 +208,8 @@ int main() {
     printf("> End\n");
 
     // Print out systolic matrix C
-    //printf("> Print Systolic Matrix C\n");
-    //systolic_matrix_print(syst_matrix_C);
+    // printf("> Print Systolic Matrix C\n");
+    // systolic_matrix_print(syst_matrix_C);
   }
 
   // wait until all cores have finished
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index 113091293..2ba8f317d 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -54,13 +54,14 @@ int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 
 // queue push
-inline int32_t queue_push(int32_t *queue, int32_t data) {
-  return __atomic_fetch_and(queue, data, __ATOMIC_RELAXED);
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
 }
 
 // queue pop
-inline int32_t queue_pop(int32_t *queue) {
-  return __atomic_fetch_or(queue, 0, __ATOMIC_RELAXED);
+static inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
 }
 
 // TODO: GENERALIZE FOR ANY NUMBER OF TILES
@@ -81,7 +82,22 @@ void systolic_init(uint32_t const *grid_mapping) {
       ++grid_pos;
     }
   }
-  // TODO: PRINT OUT THE ADDRESSES TO CHECK
+
+  // Print out queue addresses
+  // printf("queues_vert\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_horz\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_horz[y][x]);
+  //   }
+  //   printf("\n");
+  // }
 }
 
 void systolic_matrix_allocate(systolic_matrix_t **syst_matrix,
@@ -165,8 +181,8 @@ void systolic_rcp_pe(const uint32_t rep_count,
   int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused));
-  int32_t resp_vert __attribute__((unused));
+  int32_t resp_horz __attribute__((unused)) = 0;
+  int32_t resp_vert __attribute__((unused)) = 0;
   int32_t *matrix_A;
   int32_t *matrix_B;
   int32_t *matrix_C;
@@ -209,25 +225,25 @@ void systolic_rcp_pe(const uint32_t rep_count,
       for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
         data_horz[0] = matrix_A[y * num_cols_A + i];
         data_vert[0] = matrix_B[i * num_cols_B + x];
-        resp_horz = queue_push(queue_next_horz, data_horz[0]);
-        resp_vert = queue_push(queue_next_vert, data_vert[0]);
-        curr_element_0_C += data_horz[0] * data_vert[0];
+        queue_push(queue_next_horz, data_horz[0], &resp_horz);
+        queue_push(queue_next_vert, data_vert[0], &resp_vert);
         data_horz[1] = matrix_A[y * num_cols_A + i + 1];
         data_vert[1] = matrix_B[(i + 1) * num_cols_B + x];
-        resp_horz = queue_push(queue_next_horz, data_horz[1]);
-        resp_vert = queue_push(queue_next_vert, data_vert[1]);
-        curr_element_0_C += data_horz[1] * data_vert[1];
+        curr_element_0_C += data_horz[0] * data_vert[0];
+        queue_push(queue_next_horz, data_horz[1], &resp_horz);
+        queue_push(queue_next_vert, data_vert[1], &resp_vert);
         data_horz[2] = matrix_A[(y + 1) * num_cols_A + i];
         data_vert[2] = matrix_B[i * num_cols_B + x + 1];
-        resp_horz = queue_push(queue_next_horz, data_horz[1]);
-        resp_vert = queue_push(queue_next_vert, data_vert[1]);
+        curr_element_0_C += data_horz[1] * data_vert[1];
+        queue_push(queue_next_horz, data_horz[2], &resp_horz);
+        queue_push(queue_next_vert, data_vert[2], &resp_vert);
+        data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1];
+        data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1];
         curr_element_1_C += data_horz[0] * data_vert[2];
         curr_element_2_C += data_horz[2] * data_vert[0];
         curr_element_3_C += data_horz[2] * data_vert[2];
-        data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1];
-        data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1];
-        resp_horz = queue_push(queue_next_horz, data_horz[3]);
-        resp_vert = queue_push(queue_next_vert, data_vert[3]);
+        queue_push(queue_next_horz, data_horz[3], &resp_horz);
+        queue_push(queue_next_vert, data_vert[3], &resp_vert);
         curr_element_1_C += data_horz[1] * data_vert[3];
         curr_element_2_C += data_horz[3] * data_vert[1];
         curr_element_3_C += data_horz[3] * data_vert[3];
@@ -253,8 +269,8 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused));
-  int32_t resp_vert __attribute__((unused));
+  int32_t resp_horz __attribute__((unused)) = 0;
+  int32_t resp_vert __attribute__((unused)) = 0;
   int32_t *matrix_B;
   int32_t *matrix_C;
   uint32_t num_cols_B;
@@ -305,26 +321,26 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
-            data_horz[0] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
             data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            curr_element_0_C += data_horz[1] * data_vert[1];
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
             data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -340,18 +356,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            data_horz[1] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            data_horz[2] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
-            data_horz[3] = queue_pop(queue_prev_horz);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
           }
         }
       }
@@ -374,22 +390,22 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
-            data_horz[0] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
             data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            curr_element_0_C += data_horz[1] * data_vert[1];
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
             data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -405,14 +421,14 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_horz[0]);
-            data_horz[1] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_horz[1]);
-            data_horz[2] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_horz[2]);
-            data_horz[3] = queue_pop(queue_prev_horz);
-            resp_vert = queue_push(queue_next_vert, data_horz[3]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_push(queue_next_vert, data_horz[0], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_push(queue_next_vert, data_horz[1], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_push(queue_next_vert, data_horz[2], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_push(queue_next_vert, data_horz[3], &resp_vert);
           }
         }
       }
@@ -429,8 +445,8 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   int32_t *queue_next_vert;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused));
-  int32_t resp_vert __attribute__((unused));
+  int32_t resp_horz __attribute__((unused)) = 0;
+  int32_t resp_vert __attribute__((unused)) = 0;
   int32_t *matrix_A;
   int32_t *matrix_C;
   uint32_t num_cols_A;
@@ -481,26 +497,26 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
             data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            curr_element_0_C += data_horz[1] * data_vert[1];
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
             data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -516,18 +532,18 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
           }
         }
       }
@@ -550,22 +566,22 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
             data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            curr_element_0_C += data_horz[1] * data_vert[1];
+            curr_element_0_C += data_horz[0] * data_vert[0];
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
             data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            curr_element_0_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -581,14 +597,14 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_vert[0]);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_vert[1]);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_vert[2]);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_vert[3]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_vert[0], &resp_horz);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_vert[1], &resp_horz);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_vert[2], &resp_horz);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_vert[3], &resp_horz);
           }
         }
       }
@@ -607,8 +623,8 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t data_dummy __attribute__((unused)) = 0;
-  int32_t resp_horz __attribute__((unused));
-  int32_t resp_vert __attribute__((unused));
+  int32_t resp_horz __attribute__((unused)) = 0;
+  int32_t resp_vert __attribute__((unused)) = 0;
   int32_t *matrix_C;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
@@ -661,27 +677,27 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
             curr_element_0_C += data_horz[1] * data_vert[1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -697,22 +713,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
           }
         }
       }
@@ -738,23 +754,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
             curr_element_0_C += data_horz[1] * data_vert[1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -770,22 +786,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
             data_vert[0] += data_horz[0];
-            resp_vert = queue_push(queue_next_vert, data_vert[0]);
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
             data_vert[1] += data_horz[1];
-            resp_vert = queue_push(queue_next_vert, data_vert[1]);
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_vert, data_vert[1], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
             data_vert[2] += data_horz[2];
-            resp_vert = queue_push(queue_next_vert, data_vert[2]);
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
             data_vert[3] += data_horz[3];
-            resp_vert = queue_push(queue_next_vert, data_vert[3]);
+            queue_push(queue_next_vert, data_vert[3], &resp_vert);
           }
         }
       }
@@ -811,23 +827,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
             curr_element_0_C += data_horz[1] * data_vert[1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -843,22 +859,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
             data_horz[0] += data_vert[0];
-            resp_horz = queue_push(queue_next_horz, data_horz[0]);
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
             data_horz[1] += data_vert[1];
-            resp_horz = queue_push(queue_next_horz, data_horz[1]);
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_horz, data_horz[1], &resp_horz);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
             data_horz[2] += data_vert[2];
-            resp_horz = queue_push(queue_next_horz, data_horz[2]);
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
+            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
             data_horz[3] += data_vert[3];
-            resp_horz = queue_push(queue_next_horz, data_horz[3]);
+            queue_push(queue_next_horz, data_horz[3], &resp_horz);
           }
         }
       }
@@ -884,19 +900,19 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
             curr_element_0_C += data_horz[1] * data_vert[1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
             curr_element_1_C += data_horz[0] * data_vert[2];
             curr_element_2_C += data_horz[2] * data_vert[0];
             curr_element_3_C += data_horz[2] * data_vert[2];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
             curr_element_1_C += data_horz[1] * data_vert[3];
             curr_element_2_C += data_horz[3] * data_vert[1];
             curr_element_3_C += data_horz[3] * data_vert[3];
@@ -912,17 +928,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            data_horz[0] = queue_pop(queue_prev_horz);
-            data_vert[0] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[0]);
+            queue_pop(queue_prev_vert, &data_vert[0]);
             data_dummy += data_horz[0] * data_vert[0];
-            data_horz[1] = queue_pop(queue_prev_horz);
-            data_vert[1] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[1]);
+            queue_pop(queue_prev_vert, &data_vert[1]);
             data_dummy += data_horz[1] * data_vert[1];
-            data_horz[2] = queue_pop(queue_prev_horz);
-            data_vert[2] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[2]);
+            queue_pop(queue_prev_vert, &data_vert[2]);
             data_dummy += data_horz[2] * data_vert[2];
-            data_horz[3] = queue_pop(queue_prev_horz);
-            data_vert[3] = queue_pop(queue_prev_vert);
+            queue_pop(queue_prev_horz, &data_horz[3]);
+            queue_pop(queue_prev_vert, &data_vert[3]);
             data_dummy += data_horz[3] * data_vert[3];
             // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY
             if (!data_dummy)

From 72441bc90a90f8298077cf1dc89993cfda50db92 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Thu, 10 Jun 2021 17:46:17 +0200
Subject: [PATCH 12/24] [apps] Use 2 interleaved queues per direction in
 matmul_xqueue

---
 software/runtime/systolic/matmul_xqueue.h | 675 ++++++++++++----------
 1 file changed, 365 insertions(+), 310 deletions(-)

diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index 2ba8f317d..d9afe9a12 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -25,13 +25,15 @@
  * (max dimension is 16-bit)
  * Matrix is processed in 2x2 submatrices with the following indexing
  *
- *        B B          0 2
- *        B B          1 3
+ *        B B          0 1
+ *        B B          2 3
  *
- *   A A  C C  =  0 1  0 1
- *   A A  C C     2 3  2 3
+ *   A A  C C  =  0 2  0 1
+ *   A A  C C     1 3  2 3
  *
- * e.g. C0 = A1 * B1 + A0 * B0
+ * e.g. C0 = A2 * B2 + A0 * B0
+ *
+ * We use two interleaved queues per direction
  */
 
 #include "alloc.h"
@@ -50,18 +52,19 @@ typedef struct {
 // TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE
 
 // Array of queue ptrs in row-major order
-int32_t *queues_vert[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
-int32_t *queues_horz[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_vert_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_vert_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 
 // queue push
-static inline void queue_push(void *const queue, int32_t data,
-                              int32_t *const ret) {
-  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) {
+  asm volatile ("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
 }
 
 // queue pop
 inline void queue_pop(void *const queue, int32_t *const ret) {
-  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
+  asm volatile ("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
 }
 
 // TODO: GENERALIZE FOR ANY NUMBER OF TILES
@@ -76,25 +79,41 @@ void systolic_init(uint32_t const *grid_mapping) {
     for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
       tile_id = grid_mapping[grid_pos];
       tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
-      queues_vert[y][x] = &__seq_start + tile_offset + bank_sel[tile_id];
-      queues_horz[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1;
-      bank_sel[tile_id] += 2;
+      queues_vert_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 0;
+      queues_vert_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1;
+      queues_horz_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 2;
+      queues_horz_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 3;
+      bank_sel[tile_id] += 4;
       ++grid_pos;
     }
   }
 
   // Print out queue addresses
-  // printf("queues_vert\n");
+  // printf("queues_vert_0\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_0[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_vert_1\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_1[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_horz_0\n");
   // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
   //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
-  //     printf("%5d ", queues_vert[y][x]);
+  //     printf("%5d ", queues_horz_0[y][x]);
   //   }
   //   printf("\n");
   // }
-  // printf("queues_horz\n");
+  // printf("queues_horz_1\n");
   // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
   //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
-  //     printf("%5d ", queues_horz[y][x]);
+  //     printf("%5d ", queues_horz_1[y][x]);
   //   }
   //   printf("\n");
   // }
@@ -177,12 +196,16 @@ void systolic_rcp_pe(const uint32_t rep_count,
                      systolic_matrix_t const *__restrict__ A,
                      systolic_matrix_t const *__restrict__ B,
                      systolic_matrix_t const *__restrict__ C) {
-  int32_t *queue_next_horz;
-  int32_t *queue_next_vert;
+  int32_t *queue_next_horz_0;
+  int32_t *queue_next_horz_1;
+  int32_t *queue_next_vert_0;
+  int32_t *queue_next_vert_1;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused)) = 0;
-  int32_t resp_vert __attribute__((unused)) = 0;
+  int32_t resp_horz_0 __attribute__((unused)) = 0;
+  int32_t resp_horz_1 __attribute__((unused)) = 0;
+  int32_t resp_vert_0 __attribute__((unused)) = 0;
+  int32_t resp_vert_1 __attribute__((unused)) = 0;
   int32_t *matrix_A;
   int32_t *matrix_B;
   int32_t *matrix_C;
@@ -198,8 +221,10 @@ void systolic_rcp_pe(const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  queue_next_horz = queues_horz[0][1];
-  queue_next_vert = queues_vert[1][0];
+  queue_next_horz_0 = queues_horz_0[0][1];
+  queue_next_horz_1 = queues_horz_1[0][1];
+  queue_next_vert_0 = queues_vert_0[1][0];
+  queue_next_vert_1 = queues_vert_1[1][0];
 
   // Get matrix arrays
   matrix_A = A->matrix;
@@ -225,27 +250,27 @@ void systolic_rcp_pe(const uint32_t rep_count,
       for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
         data_horz[0] = matrix_A[y * num_cols_A + i];
         data_vert[0] = matrix_B[i * num_cols_B + x];
-        queue_push(queue_next_horz, data_horz[0], &resp_horz);
-        queue_push(queue_next_vert, data_vert[0], &resp_vert);
-        data_horz[1] = matrix_A[y * num_cols_A + i + 1];
-        data_vert[1] = matrix_B[(i + 1) * num_cols_B + x];
+        data_horz[1] = matrix_A[(y + 1) * num_cols_A + i];
+        data_vert[1] = matrix_B[i * num_cols_B + x + 1];
+        queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+        queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+        queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+        queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
         curr_element_0_C += data_horz[0] * data_vert[0];
-        queue_push(queue_next_horz, data_horz[1], &resp_horz);
-        queue_push(queue_next_vert, data_vert[1], &resp_vert);
-        data_horz[2] = matrix_A[(y + 1) * num_cols_A + i];
-        data_vert[2] = matrix_B[i * num_cols_B + x + 1];
-        curr_element_0_C += data_horz[1] * data_vert[1];
-        queue_push(queue_next_horz, data_horz[1], &resp_horz);
-        queue_push(queue_next_vert, data_vert[1], &resp_vert);
+        curr_element_1_C += data_horz[0] * data_vert[1];
+        curr_element_2_C += data_horz[1] * data_vert[0];
+        curr_element_3_C += data_horz[1] * data_vert[1];
+        data_horz[2] = matrix_A[y * num_cols_A + i + 1];
+        data_vert[2] = matrix_B[(i + 1) * num_cols_B + x];
         data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1];
         data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1];
-        curr_element_1_C += data_horz[0] * data_vert[2];
-        curr_element_2_C += data_horz[2] * data_vert[0];
-        curr_element_3_C += data_horz[2] * data_vert[2];
-        queue_push(queue_next_horz, data_horz[3], &resp_horz);
-        queue_push(queue_next_vert, data_vert[3], &resp_vert);
-        curr_element_1_C += data_horz[1] * data_vert[3];
-        curr_element_2_C += data_horz[3] * data_vert[1];
+        queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+        queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+        queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+        queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+        curr_element_0_C += data_horz[2] * data_vert[2];
+        curr_element_1_C += data_horz[2] * data_vert[3];
+        curr_element_2_C += data_horz[3] * data_vert[2];
         curr_element_3_C += data_horz[3] * data_vert[3];
       }
 
@@ -264,13 +289,18 @@ void systolic_rcp_pe(const uint32_t rep_count,
 void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ B,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *queue_prev_horz;
-  int32_t *queue_next_horz;
-  int32_t *queue_next_vert;
+  int32_t *queue_prev_horz_0;
+  int32_t *queue_prev_horz_1;
+  int32_t *queue_next_horz_0;
+  int32_t *queue_next_horz_1;
+  int32_t *queue_next_vert_0;
+  int32_t *queue_next_vert_1;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused)) = 0;
-  int32_t resp_vert __attribute__((unused)) = 0;
+  int32_t resp_horz_0 __attribute__((unused)) = 0;
+  int32_t resp_horz_1 __attribute__((unused)) = 0;
+  int32_t resp_vert_0 __attribute__((unused)) = 0;
+  int32_t resp_vert_1 __attribute__((unused)) = 0;
   int32_t *matrix_B;
   int32_t *matrix_C;
   uint32_t num_cols_B;
@@ -285,13 +315,17 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  queue_prev_horz = queues_horz[0][col_idx];
+  queue_prev_horz_0 = queues_horz_0[0][col_idx];
+  queue_prev_horz_1 = queues_horz_1[0][col_idx];
   if (col_idx == SYSTOLIC_SIZE - 1) {
-    queue_next_horz = NULL;
+    queue_next_horz_0 = NULL;
+    queue_next_horz_1 = NULL;
   } else {
-    queue_next_horz = queues_horz[0][col_idx + 1];
+    queue_next_horz_0 = queues_horz_0[0][col_idx + 1];
+    queue_next_horz_1 = queues_horz_1[0][col_idx + 1];
   }
-  queue_next_vert = queues_vert[1][col_idx];
+  queue_next_vert_0 = queues_vert_0[1][col_idx];
+  queue_next_vert_1 = queues_vert_1[1][col_idx];
 
   // Get matrix arrays
   matrix_B = B->matrix;
@@ -303,7 +337,7 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
   num_cols_C = C->num_cols;
 
   // Check if PE is at the right boundary
-  if (queue_next_horz) {
+  if (queue_next_horz_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -320,29 +354,29 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
             data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1];
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
             data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -356,18 +390,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
           }
         }
       }
@@ -389,25 +423,25 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
             data_vert[0] = matrix_B[i * num_cols_B + shifted_x];
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            data_vert[1] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1];
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            data_vert[2] = matrix_B[i * num_cols_B + shifted_x + 1];
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x];
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
             data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1];
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -421,14 +455,14 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_push(queue_next_vert, data_horz[0], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_push(queue_next_vert, data_horz[1], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_push(queue_next_vert, data_horz[2], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_push(queue_next_vert, data_horz[3], &resp_vert);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_push(queue_next_vert_0, data_horz[0], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_horz[1], &resp_vert_1);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_push(queue_next_vert_0, data_horz[2], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_horz[3], &resp_vert_1);
           }
         }
       }
@@ -440,13 +474,18 @@ void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count,
 void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ A,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *queue_next_horz;
-  int32_t *queue_prev_vert;
-  int32_t *queue_next_vert;
+  int32_t *queue_next_horz_0;
+  int32_t *queue_next_horz_1;
+  int32_t *queue_prev_vert_0;
+  int32_t *queue_prev_vert_1;
+  int32_t *queue_next_vert_0;
+  int32_t *queue_next_vert_1;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
-  int32_t resp_horz __attribute__((unused)) = 0;
-  int32_t resp_vert __attribute__((unused)) = 0;
+  int32_t resp_horz_0 __attribute__((unused)) = 0;
+  int32_t resp_horz_1 __attribute__((unused)) = 0;
+  int32_t resp_vert_0 __attribute__((unused)) = 0;
+  int32_t resp_vert_1 __attribute__((unused)) = 0;
   int32_t *matrix_A;
   int32_t *matrix_C;
   uint32_t num_cols_A;
@@ -461,12 +500,16 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   uint32_t anchor_row_1;
 
   // Assign queues
-  queue_next_horz = queues_horz[row_idx][1];
-  queue_prev_vert = queues_vert[row_idx][0];
+  queue_next_horz_0 = queues_horz_0[row_idx][1];
+  queue_next_horz_1 = queues_horz_1[row_idx][1];
+  queue_prev_vert_0 = queues_vert_0[row_idx][0];
+  queue_prev_vert_1 = queues_vert_1[row_idx][0];
   if (row_idx == SYSTOLIC_SIZE - 1) {
-    queue_next_vert = NULL;
+    queue_next_vert_0 = NULL;
+    queue_next_vert_1 = NULL;
   } else {
-    queue_next_vert = queues_vert[row_idx + 1][0];
+    queue_next_vert_0 = queues_vert_0[row_idx + 1][0];
+    queue_next_vert_1 = queues_vert_1[row_idx + 1][0];
   }
 
   // Get matrix arrays
@@ -479,7 +522,7 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
   num_cols_C = C->num_cols;
 
   // Check if PE is at the bottom boundary
-  if (queue_next_vert) {
+  if (queue_next_vert_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -497,28 +540,28 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i];
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1];
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
             data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -532,18 +575,18 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
           }
         }
       }
@@ -566,24 +609,24 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < 2 * rep_count; i += 2) {
             data_horz[0] = matrix_A[shifted_y * num_cols_A + i];
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            data_horz[1] = matrix_A[shifted_y * num_cols_A + i + 1];
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i];
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            data_horz[2] = matrix_A[(shifted_y + 1) * num_cols_A + i];
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1];
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
             data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1];
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -597,14 +640,14 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_vert[0], &resp_horz);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_vert[1], &resp_horz);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_vert[2], &resp_horz);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_vert[3], &resp_horz);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_vert[0], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_vert[1], &resp_horz_1);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_vert[2], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_vert[3], &resp_horz_1);
           }
         }
       }
@@ -616,15 +659,21 @@ void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count,
 void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
                     const uint32_t rep_count,
                     systolic_matrix_t const *__restrict__ C) {
-  int32_t *queue_prev_horz;
-  int32_t *queue_next_horz;
-  int32_t *queue_prev_vert;
-  int32_t *queue_next_vert;
+  int32_t *queue_prev_horz_0;
+  int32_t *queue_prev_horz_1;
+  int32_t *queue_next_horz_0;
+  int32_t *queue_next_horz_1;
+  int32_t *queue_prev_vert_0;
+  int32_t *queue_prev_vert_1;
+  int32_t *queue_next_vert_0;
+  int32_t *queue_next_vert_1;
   int32_t data_horz[4] = {0, 0, 0, 0};
   int32_t data_vert[4] = {0, 0, 0, 0};
   int32_t data_dummy __attribute__((unused)) = 0;
-  int32_t resp_horz __attribute__((unused)) = 0;
-  int32_t resp_vert __attribute__((unused)) = 0;
+  int32_t resp_horz_0 __attribute__((unused)) = 0;
+  int32_t resp_horz_1 __attribute__((unused)) = 0;
+  int32_t resp_vert_0 __attribute__((unused)) = 0;
+  int32_t resp_vert_1 __attribute__((unused)) = 0;
   int32_t *matrix_C;
   uint32_t num_rows_C;
   uint32_t num_cols_C;
@@ -638,17 +687,23 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   uint32_t anchor_row_1;
 
   // Assign queues
-  queue_prev_horz = queues_horz[row_idx][col_idx];
+  queue_prev_horz_0 = queues_horz_0[row_idx][col_idx];
+  queue_prev_horz_1 = queues_horz_1[row_idx][col_idx];
   if (col_idx == SYSTOLIC_SIZE - 1) {
-    queue_next_horz = NULL;
+    queue_next_horz_0 = NULL;
+    queue_next_horz_1 = NULL;
   } else {
-    queue_next_horz = queues_horz[row_idx][col_idx + 1];
+    queue_next_horz_0 = queues_horz_0[row_idx][col_idx + 1];
+    queue_next_horz_1 = queues_horz_1[row_idx][col_idx + 1];
   }
-  queue_prev_vert = queues_vert[row_idx][col_idx];
+  queue_prev_vert_0 = queues_vert_0[row_idx][col_idx];
+  queue_prev_vert_1 = queues_vert_1[row_idx][col_idx];
   if (row_idx == SYSTOLIC_SIZE - 1) {
-    queue_next_vert = NULL;
+    queue_next_vert_0 = NULL;
+    queue_next_vert_1 = NULL;
   } else {
-    queue_next_vert = queues_vert[row_idx + 1][col_idx];
+    queue_next_vert_0 = queues_vert_0[row_idx + 1][col_idx];
+    queue_next_vert_1 = queues_vert_1[row_idx + 1][col_idx];
   }
 
   // Get matrix arrays
@@ -659,7 +714,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   num_cols_C = C->num_cols;
 
   // PE is not at a boundary
-  if (queue_next_horz && queue_next_vert) {
+  if (queue_next_horz_0 && queue_next_vert_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -677,29 +732,29 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -713,22 +768,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
           }
         }
       }
@@ -736,7 +791,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   }
 
   // PE is at the right boundary
-  if (!queue_next_horz && queue_next_vert) {
+  if (!queue_next_horz_0 && queue_next_vert_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -754,25 +809,25 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -786,22 +841,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
             data_vert[0] += data_horz[0];
-            queue_push(queue_next_vert, data_vert[0], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
             data_vert[1] += data_horz[1];
-            queue_push(queue_next_vert, data_vert[1], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
             data_vert[2] += data_horz[2];
-            queue_push(queue_next_vert, data_vert[2], &resp_vert);
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
             data_vert[3] += data_horz[3];
-            queue_push(queue_next_vert, data_vert[3], &resp_vert);
+            queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0);
+            queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1);
           }
         }
       }
@@ -809,7 +864,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   }
 
   // PE is at the bottom boundary
-  if (queue_next_horz && !queue_next_vert) {
+  if (queue_next_horz_0 && !queue_next_vert_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -827,25 +882,25 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -859,22 +914,22 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
             data_horz[0] += data_vert[0];
-            queue_push(queue_next_horz, data_horz[0], &resp_horz);
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
             data_horz[1] += data_vert[1];
-            queue_push(queue_next_horz, data_horz[1], &resp_horz);
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
             data_horz[2] += data_vert[2];
-            queue_push(queue_next_horz, data_horz[2], &resp_horz);
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
             data_horz[3] += data_vert[3];
-            queue_push(queue_next_horz, data_horz[3], &resp_horz);
+            queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0);
+            queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1);
           }
         }
       }
@@ -882,7 +937,7 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
   }
 
   // PE is at the bottom right corner
-  if (!queue_next_horz && !queue_next_vert) {
+  if (!queue_next_horz_0 && !queue_next_vert_0) {
     // Execute step-wise matrix multiplication
     for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) {
       for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) {
@@ -900,21 +955,21 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
 
           // Systolic matrix multiplication through MACs
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
             curr_element_0_C += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
-            curr_element_0_C += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
-            curr_element_1_C += data_horz[0] * data_vert[2];
-            curr_element_2_C += data_horz[2] * data_vert[0];
-            curr_element_3_C += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
-            curr_element_1_C += data_horz[1] * data_vert[3];
-            curr_element_2_C += data_horz[3] * data_vert[1];
+            curr_element_1_C += data_horz[0] * data_vert[1];
+            curr_element_2_C += data_horz[1] * data_vert[0];
+            curr_element_3_C += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
             curr_element_3_C += data_horz[3] * data_vert[3];
           }
 
@@ -928,17 +983,17 @@ void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx,
         } else {
           // Pop and push dummy data
           for (uint32_t i = 0; i < rep_count; ++i) {
-            queue_pop(queue_prev_horz, &data_horz[0]);
-            queue_pop(queue_prev_vert, &data_vert[0]);
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
             data_dummy += data_horz[0] * data_vert[0];
-            queue_pop(queue_prev_horz, &data_horz[1]);
-            queue_pop(queue_prev_vert, &data_vert[1]);
             data_dummy += data_horz[1] * data_vert[1];
-            queue_pop(queue_prev_horz, &data_horz[2]);
-            queue_pop(queue_prev_vert, &data_vert[2]);
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
             data_dummy += data_horz[2] * data_vert[2];
-            queue_pop(queue_prev_horz, &data_horz[3]);
-            queue_pop(queue_prev_vert, &data_vert[3]);
             data_dummy += data_horz[3] * data_vert[3];
             // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY
             if (!data_dummy)

From 3674ea7b81d7034d6f331d583c65101877510dd1 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Tue, 15 Jun 2021 16:13:40 +0200
Subject: [PATCH 13/24] [apps] Improve matmul_xqueue code - generalize
 systolic_init() - add additional grid mappings

---
 software/apps/systolic/matmul_xqueue/main.c | 80 ++++++++++++---------
 software/runtime/systolic/matmul_xqueue.h   | 26 +++----
 2 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index fafc4fbea..f7a648ab3 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -27,11 +27,12 @@
 #include "synchronization.h"
 
 // Dimensions of matrices
-#define DIM_M 16
-#define DIM_N 16
-#define DIM_P 16
+#define DIM_M 24
+#define DIM_N 24
+#define DIM_P 24
 
-uint32_t *grid_mapping;
+uint32_t *tile_mapping;
+uint32_t *core_mapping;
 
 int32_t *matrix_A;
 int32_t *matrix_B;
@@ -77,7 +78,8 @@ int main() {
 
   // Allocate systolic grid mapping
   if (core_id == 0) {
-    grid_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+    tile_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+    core_mapping = (uint32_t *)simple_malloc(num_cores * 4);
   }
 
   // ----------
@@ -92,44 +94,53 @@ int main() {
   uint32_t col_idx = core_id / 4;
   uint32_t row_idx = core_id % 4;
 
-  // Assign grid position (tile wise)
-  // uint32_t col_idx;
-  // uint32_t row_idx;
-  // if (core_id < 4) {
-  //   col_idx = core_id % 2;
-  //   row_idx = core_id / 2;
-  // } else if (core_id < 8) {
-  //   col_idx = core_id % 2 + 2;
-  //   row_idx = core_id / 6;
-  // } else if (core_id < 12) {
-  //   col_idx = core_id % 2;
-  //   row_idx = core_id / 10 + 2;
-  // } else {
-  //   col_idx = core_id % 2 + 2;
-  //   row_idx = core_id / 14 + 2;
-  // }
-
-  // uint32_t mapped_tile = tile_id;
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 2;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 2;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
 
   // ----------
   // 256 CORES
   // ----------
 
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 16;
+  // uint32_t row_idx = core_id / 16;
+
   // Assign grid position (col wise)
   // uint32_t col_idx = core_id / 16;
   // uint32_t row_idx = core_id % 16;
 
-  // Assign grid position (tile wise)
-  // uint32_t mapped_group = core_id % 4;
-  // uint32_t col_idx = tile_id / 4;
-  // uint32_t row_idx = (tile_id % 4) + (mapped_group * 4);
-  // uint32_t mapped_tile = (tile_id % 16) + (mapped_group * 16);
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 8;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 8;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+
+  // Assign grid position (square square wise)
+  // uint32_t group_id = tile_id / 16;
+  // uint32_t add_col = group_id % 2;
+  // uint32_t add_row = group_id / 2;
+  // uint32_t col_idx = tile_id % 4;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // col_idx += add_col * 8;
+  // uint32_t row_idx = (tile_id % 16) / 4;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+  // row_idx += add_row * 8;
 
   // Wait for all cores
   mempool_barrier(num_cores);
 
-  // Set systolic grid mapping
-  grid_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id;
+  // Set tile and core mapping
+  tile_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id;
+  core_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = core_id;
 
   // Wait for all cores
   mempool_barrier(num_cores);
@@ -138,11 +149,14 @@ int main() {
   if (core_id == 0) {
     printf("> Initialize\n");
 
-    // Print out grid mapping
-    // print_matrix((int32_t *)grid_mapping, 4, 4);
+    // Print out tile mapping
+    // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+
+    // Print out core mapping
+    // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
 
     // Initialize systolic array
-    systolic_init(grid_mapping);
+    systolic_init(tile_mapping, core_mapping);
 
     // Create systolic matrices
     generate_gradient_matrix(&matrix_A, DIM_M, DIM_N);
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index d9afe9a12..cb26e762b 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -58,32 +58,34 @@ int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
 
 // queue push
-static inline void queue_push(void *const queue, int32_t data, int32_t *const ret) {
-  asm volatile ("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
 }
 
 // queue pop
 inline void queue_pop(void *const queue, int32_t *const ret) {
-  asm volatile ("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
 }
 
-// TODO: GENERALIZE FOR ANY NUMBER OF TILES
-void systolic_init(uint32_t const *grid_mapping) {
+void systolic_init(uint32_t const *tile_mapping, uint32_t const *core_mapping) {
   // Create systolic array via queues
   extern int32_t __seq_start;
   uint32_t grid_pos = 0;
   uint32_t tile_id;
+  uint32_t core_id;
   uint32_t tile_offset;
-  uint32_t bank_sel[4] = {0, 0, 0, 0};
+  uint32_t core_offset;
   for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
     for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
-      tile_id = grid_mapping[grid_pos];
+      tile_id = tile_mapping[grid_pos];
+      core_id = core_mapping[grid_pos];
       tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
-      queues_vert_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 0;
-      queues_vert_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 1;
-      queues_horz_0[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 2;
-      queues_horz_1[y][x] = &__seq_start + tile_offset + bank_sel[tile_id] + 3;
-      bank_sel[tile_id] += 4;
+      core_offset = core_id % 4 * 4;
+      queues_vert_0[y][x] = &__seq_start + tile_offset + core_offset + 0;
+      queues_vert_1[y][x] = &__seq_start + tile_offset + core_offset + 1;
+      queues_horz_0[y][x] = &__seq_start + tile_offset + core_offset + 2;
+      queues_horz_1[y][x] = &__seq_start + tile_offset + core_offset + 3;
       ++grid_pos;
     }
   }

From 3ea900b5c47140a8f6c3d688ea2b8b23ce8fb5f7 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Tue, 13 Sep 2022 15:58:22 +0200
Subject: [PATCH 14/24] [apps] Implement systolic xqueue 2d convolution

---
 software/apps/systolic/conv_xqueue/main.c | 250 +++++++++++++++
 software/runtime/systolic/conv_xqueue.h   | 358 ++++++++++++++++++++++
 2 files changed, 608 insertions(+)
 create mode 100644 software/apps/systolic/conv_xqueue/main.c
 create mode 100644 software/runtime/systolic/conv_xqueue.h

diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
new file mode 100644
index 000000000..e3b3644ab
--- /dev/null
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -0,0 +1,250 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "systolic/conv_xqueue.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+// Dimensions of matrix X
+#define DIM_X_M 32
+#define DIM_X_N 32
+
+// Dimensions of matrix Y
+#define DIM_Y_M (DIM_X_M - (KERNEL_SIZE - 1))
+#define DIM_Y_N (DIM_X_N - (KERNEL_SIZE - 1))
+
+// Dimensions of maps (parenthesized so the macros expand safely in expressions)
+#define KERNEL_ROWS KERNEL_SIZE
+#define KERNEL_COLS (KERNEL_SIZE * NUM_KERNELS)
+#define NUM_ACCS NUM_KERNELS
+
+uint32_t *kernel_tile_map;
+uint32_t *kernel_core_map;
+uint32_t *row_acc_tile_map;
+uint32_t *row_acc_core_map;
+
+int32_t *matrix_X;
+int32_t *matrix_Y;
+
+int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  // Allocate a row-major matrix whose element (r, c) holds r + c
+  int32_t *grad = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t r = 0, idx = 0; r < num_rows; ++r) {
+    for (uint32_t c = 0; c < num_cols; ++c, ++idx)
+      grad[idx] = (int32_t)(r + c);
+  }
+  *matrix = grad;
+}
+
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%8X\n", (uint32_t)matrix); // header: base address
+  for (uint32_t r = 0; r < num_rows; ++r) {
+    int32_t const *row = matrix + r * num_columns; // first element of row r
+    for (uint32_t c = 0; c < num_columns; ++c)
+      printf("%5d ", row[c]);
+    printf("\n");
+  }
+}
+
+int main() {
+  // Map every core onto the systolic kernel grid, then run the 2d convolution
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate tile and core maps
+  if (core_id == 0) {
+    kernel_tile_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4);
+    kernel_core_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4);
+    row_acc_tile_map = (uint32_t *)simple_malloc(NUM_ACCS * 4);
+    row_acc_core_map = (uint32_t *)simple_malloc(NUM_ACCS * 4);
+  }
+
+  // Systolic identifiers
+  int32_t is_enabled = 0;
+  int32_t is_kernel_core = 0;
+  uint32_t kernel_id = 0;
+  uint32_t kernel_row = 0;
+  uint32_t kernel_col = 0;
+
+  // ----------
+  // ACC COMBO
+  // ----------
+
+  // TODO: VISUAL DESCRIPTION
+  // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3
+
+  // A group of 5 tiles hosts 2 kernels: kernel ids advance by 2 per group
+  kernel_id = 2 * (tile_id / 5);
+  uint32_t kernel_pair_id = tile_id % 5;
+  uint32_t tile_core_id = core_id % 4;
+  if (kernel_pair_id < 3) {
+    is_kernel_core = 1;
+    kernel_row = kernel_pair_id;
+    kernel_col = tile_core_id % 2;
+    kernel_id += tile_core_id / 2;
+  } else {
+    if (tile_core_id == 3) {
+      is_kernel_core = 0;
+    } else {
+      is_kernel_core = 1;
+      kernel_row = tile_core_id;
+      kernel_col = 2;
+    }
+    kernel_id += kernel_pair_id % 3;
+  }
+
+  // Core is only enabled if its kernel is required
+  is_enabled = (kernel_id < NUM_KERNELS);
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set tile and core maps
+  if (is_enabled) {
+    if (is_kernel_core) {
+      // Kernels are concatenated along the map columns (see queues_x/queues_y)
+      uint32_t map_col = kernel_id * KERNEL_SIZE + kernel_col;
+      kernel_tile_map[kernel_row * KERNEL_COLS + map_col] = tile_id;
+      kernel_core_map[kernel_row * KERNEL_COLS + map_col] = core_id;
+    } else {
+      row_acc_tile_map[kernel_id] = tile_id;
+      row_acc_core_map[kernel_id] = core_id;
+    }
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out maps
+    // print_matrix((int32_t *)kernel_tile_map, KERNEL_ROWS, KERNEL_COLS);
+    // print_matrix((int32_t *)kernel_core_map, KERNEL_ROWS, KERNEL_COLS);
+    // print_matrix((int32_t *)row_acc_tile_map, 1, NUM_ACCS);
+    // print_matrix((int32_t *)row_acc_core_map, 1, NUM_ACCS);
+
+    // Initialize systolic array
+    systolic_init(kernel_tile_map, kernel_core_map, row_acc_tile_map,
+                  row_acc_core_map);
+
+    // Create and initialize matrices
+    generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N);
+    matrix_Y = (int32_t *)simple_malloc(DIM_Y_M * DIM_Y_N * 4);
+
+    // Print out matrix X
+    // printf("> Print Matrix X\n");
+    // print_matrix(matrix_X, DIM_X_M, DIM_X_N);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    // mempool_start_benchmark();
+  }
+
+  // Start benchmark for all cores
+  mempool_barrier(num_cores);
+  mempool_start_benchmark();
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (is_enabled) {
+    if (is_kernel_core) {
+      switch (kernel_col) {
+      case 0:
+        if (kernel_id == 0) {
+          systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                     matrix_X, (int32_t *)weights);
+        } else {
+          if (kernel_row == 2) {
+            systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                       matrix_X, (int32_t *)weights);
+          } else {
+            systolic_conv_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                 (int32_t *)weights);
+          }
+        }
+        break;
+      case (KERNEL_SIZE - 1):
+        if (kernel_id == NUM_KERNELS - 1) {
+          systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                  (int32_t *)weights);
+        } else {
+          if (kernel_row == 0) {
+            systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                    (int32_t *)weights);
+          } else {
+            systolic_conv_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                               (int32_t *)weights);
+          }
+        }
+        break;
+      default:
+        systolic_conv_follower(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                               (int32_t *)weights);
+      }
+    } else {
+      systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y);
+    }
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Stop benchmark for all cores
+  mempool_stop_benchmark();
+  mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    // mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out matrix Y
+    printf("> Print Matrix Y\n");
+    print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N);
+  }
+
+  // wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
new file mode 100644
index 000000000..8730dcef3
--- /dev/null
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -0,0 +1,358 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* TODO DESCRIPTION
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Kernel size (fixed)
+#define KERNEL_SIZE 3
+
+// Number of kernels
+#define NUM_KERNELS 1
+
+// Array of queue ptrs in row-major order (concatenated kernels)
+int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE];
+int32_t *queues_y[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE];
+int32_t *queues_row_acc[KERNEL_SIZE][NUM_KERNELS];
+
+// queue push: q.push.w of data to the queue at address queue; result in *ret
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+}
+
+// queue pop (static like queue_push, so non-inlined calls still link)
+static inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue)); // value -> *ret
+}
+
+void systolic_init(uint32_t const *kernel_tile_map,
+                   uint32_t const *kernel_core_map,
+                   uint32_t const *row_acc_tile_map,
+                   uint32_t const *row_acc_core_map) {
+  // Create systolic array via queues: every queue pointer is derived from
+  // the tile id and core id that the maps record for its grid position.
+  // Offsets are counted in int32_t elements from __seq_start; each core
+  // gets a 4-element window (core_id % 4 * 4) on top of its tile offset
+  extern int32_t __seq_start;
+  uint32_t map_idx;
+  uint32_t tile_offset;
+  uint32_t core_offset;
+  int32_t *core_base;
+
+  // Kernel queues: element 0 of a core's window is its x queue, element 1
+  // its y queue; the maps are read in row-major order
+  for (uint32_t row = 0; row < KERNEL_SIZE; ++row) {
+    for (uint32_t col = 0; col < NUM_KERNELS * KERNEL_SIZE; ++col) {
+      map_idx = row * (NUM_KERNELS * KERNEL_SIZE) + col;
+      tile_offset = kernel_tile_map[map_idx] * 4 * SEQ_MEM_SIZE / 4;
+      core_offset = kernel_core_map[map_idx] % 4 * 4;
+      core_base = &__seq_start + tile_offset + core_offset;
+      queues_x[row][col] = core_base + 0;
+      queues_y[row][col] = core_base + 1;
+    }
+  }
+
+  // Row accumulator queues: elements 0..2 of the accumulator core's window,
+  // one per first index of queues_row_acc
+  for (uint32_t acc = 0; acc < NUM_KERNELS; ++acc) {
+    tile_offset = row_acc_tile_map[acc] * 4 * SEQ_MEM_SIZE / 4;
+    core_offset = row_acc_core_map[acc] % 4 * 4;
+    core_base = &__seq_start + tile_offset + core_offset;
+    queues_row_acc[0][acc] = core_base + 0;
+    queues_row_acc[1][acc] = core_base + 1;
+    queues_row_acc[2][acc] = core_base + 2;
+  }
+
+  // Print out queue addresses
+  // printf("queues_x\n");
+  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) {
+  //     printf("%5d ", queues_x[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_y\n");
+  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) {
+  //     printf("%5d ", queues_y[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_row_acc\n");
+  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < NUM_KERNELS; ++x) {
+  //     printf("%5d ", queues_row_acc[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+}
+
+void systolic_conv_first_leader(const uint32_t kernel_id,
+                                const uint32_t kernel_row,
+                                const uint32_t num_rows,
+                                const uint32_t num_cols,
+                                int32_t const *__restrict__ X,
+                                int32_t const *__restrict__ W) {
+  int32_t *queue_next_x;
+  int32_t *queue_next_y;
+  int32_t resp_x __attribute__((unused)) = 0; // push response, never read here
+  int32_t resp_y __attribute__((unused)) = 0; // push response, never read here
+  int32_t weight;
+  int32_t curr_x;
+  int32_t curr_y;
+  uint32_t first_row = kernel_id + kernel_row;
+  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; // exclusive
+
+  // Assign queues: x is read straight from X (no input queue); both outputs
+  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
+  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1]; // col 1
+
+  // Load weight (column 0 of this kernel row)
+  weight = W[kernel_row * KERNEL_SIZE + 0];
+
+  // Execute row-wise systolic 2d convolution, one row of X per iteration
+  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
+    // Populate kernel: forward the first two inputs without pushing any y
+    curr_x = X[row * num_cols + 0];
+    queue_push(queue_next_x, curr_x, &resp_x);
+    curr_x = X[row * num_cols + 1];
+    queue_push(queue_next_x, curr_x, &resp_x);
+    curr_x = X[row * num_cols + 2];
+    // Convolution: forward x, load the next x, then push y = x * weight
+    for (uint32_t col = 3; col < num_cols; ++col) {
+      queue_push(queue_next_x, curr_x, &resp_x);
+      curr_y = curr_x * weight;
+      curr_x = X[row * num_cols + col];
+      queue_push(queue_next_y, curr_y, &resp_y);
+    }
+    // Flush kernel: forward the last input of the row and its product
+    queue_push(queue_next_x, curr_x, &resp_x);
+    curr_y = curr_x * weight;
+    queue_push(queue_next_y, curr_y, &resp_y);
+  }
+}
+
+void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row,
+                          const uint32_t num_rows, const uint32_t num_cols,
+                          int32_t const *__restrict__ W) {
+  int32_t *queue_prev_x;
+  int32_t *queue_next_x;
+  int32_t *queue_next_y;
+  int32_t resp_x __attribute__((unused)) = 0; // push response, never read here
+  int32_t resp_y __attribute__((unused)) = 0; // push response, never read here
+  int32_t weight;
+  int32_t curr_x;
+  int32_t curr_y;
+  uint32_t first_row = kernel_id + kernel_row;
+  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1; // exclusive
+
+  // Assign queues: x is popped from this kernel's column-0 queue and both
+  // outputs go to the column-1 queues of the same kernel row
+  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 0];
+  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
+  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1];
+
+  // Load weight (column 0 of this kernel row)
+  weight = W[kernel_row * KERNEL_SIZE + 0];
+
+  // Execute row-wise systolic 2d convolution, one input row per iteration
+  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
+    // Populate kernel: forward the first two inputs without pushing any y
+    queue_pop(queue_prev_x, &curr_x);
+    queue_push(queue_next_x, curr_x, &resp_x);
+    queue_pop(queue_prev_x, &curr_x);
+    queue_push(queue_next_x, curr_x, &resp_x);
+    queue_pop(queue_prev_x, &curr_x);
+    // Convolution: forward x, pop the next x, then push y = x * weight
+    for (uint32_t col = 3; col < num_cols; ++col) {
+      queue_push(queue_next_x, curr_x, &resp_x);
+      curr_y = curr_x * weight;
+      queue_pop(queue_prev_x, &curr_x);
+      queue_push(queue_next_y, curr_y, &resp_y);
+    }
+    // Flush kernel: forward the last input of the row and its product
+    queue_push(queue_next_x, curr_x, &resp_x);
+    curr_y = curr_x * weight;
+    queue_push(queue_next_y, curr_y, &resp_y);
+  }
+}
+
+void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row,
+                            const uint32_t num_rows, const uint32_t num_cols,
+                            int32_t const *__restrict__ W) {
+  int32_t *queue_prev_x;
+  int32_t *queue_next_x;
+  int32_t *queue_prev_y;
+  int32_t *queue_next_y;
+  int32_t resp_x __attribute__((unused)) = 0;
+  int32_t resp_y __attribute__((unused)) = 0;
+  int32_t weight;
+  int32_t curr_x;
+  int32_t curr_y;
+  uint32_t first_row = kernel_id + kernel_row;
+  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+
+  // Assign queues
+  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
+  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
+  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1];
+  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
+
+  // Load weight
+  weight = W[kernel_row * KERNEL_SIZE + 1];
+
+  // Execute row-wise systolic 2d convolution
+  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
+    // Populate kernel
+    queue_pop(queue_prev_x, &curr_x);
+    queue_push(queue_next_x, curr_x, &resp_x);
+    queue_pop(queue_prev_x, &curr_x);
+    // Convolution
+    for (uint32_t col = 2; col < num_cols; ++col) {
+      queue_pop(queue_prev_y, &curr_y);
+      queue_push(queue_next_x, curr_x, &resp_x);
+      curr_y += curr_x * weight;
+      queue_pop(queue_prev_x, &curr_x);
+      queue_push(queue_next_y, curr_y, &resp_y);
+    }
+    // Flush kernel
+    queue_push(queue_next_x, curr_x, &resp_x);
+  }
+}
+
+void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row,
+                        const uint32_t num_rows, const uint32_t num_cols,
+                        int32_t const *__restrict__ W) {
+  int32_t *queue_prev_x;
+  int32_t *queue_next_x;
+  int32_t *queue_prev_y;
+  int32_t *queue_next_y;
+  int32_t resp_x __attribute__((unused)) = 0;
+  int32_t resp_y __attribute__((unused)) = 0;
+  int32_t weight;
+  int32_t curr_x;
+  int32_t curr_y;
+  uint32_t first_row = kernel_id + kernel_row;
+  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+
+  // Assign queues
+  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
+  queue_next_x = queues_x[kernel_row + 1][(kernel_id + 1) * KERNEL_SIZE];
+  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
+  queue_next_y = queues_row_acc[kernel_row][kernel_id];
+
+  // Load weight
+  weight = W[kernel_row * KERNEL_SIZE + 2];
+
+  // Execute row-wise systolic 2d convolution
+  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
+    // Populate kernel
+    queue_pop(queue_prev_x, &curr_x);
+    // Convolution
+    for (uint32_t col = 1; col < num_cols - 1; ++col) {
+      queue_pop(queue_prev_y, &curr_y);
+      queue_push(queue_next_x, curr_x, &resp_x);
+      curr_y += curr_x * weight;
+      queue_pop(queue_prev_x, &curr_x);
+      queue_push(queue_next_y, curr_y, &resp_y);
+    }
+    // Flush kernel
+    queue_push(queue_next_x, curr_x, &resp_x);
+    queue_pop(queue_prev_x, &curr_x);
+    queue_push(queue_next_x, curr_x, &resp_x);
+  }
+}
+
+void systolic_conv_last_NAME(const uint32_t kernel_id,
+                             const uint32_t kernel_row, const uint32_t num_rows,
+                             const uint32_t num_cols,
+                             int32_t const *__restrict__ W) {
+  int32_t *queue_prev_x;
+  int32_t *queue_prev_y;
+  int32_t *queue_next_y;
+  int32_t resp_y __attribute__((unused)) = 0;
+  int32_t weight;
+  int32_t curr_x;
+  int32_t curr_y;
+  uint32_t first_row = kernel_id + kernel_row;
+  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+
+  // Assign queues
+  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
+  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
+  queue_next_y = queues_row_acc[kernel_row][kernel_id];
+
+  // Load weight
+  weight = W[kernel_row * KERNEL_SIZE + 2];
+
+  // Execute row-wise systolic 2d convolution
+  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
+    // Populate kernel
+    queue_pop(queue_prev_x, &curr_x);
+    // Convolution
+    for (uint32_t col = 1; col < num_cols - 1; ++col) {
+      queue_pop(queue_prev_y, &curr_y);
+      curr_y += curr_x * weight;
+      queue_pop(queue_prev_x, &curr_x);
+      queue_push(queue_next_y, curr_y, &resp_y);
+    }
+    // Flush kernel
+    queue_pop(queue_prev_x, &curr_x);
+  }
+}
+
+void systolic_conv_row_acc(const uint32_t kernel_id, const uint32_t num_rows_y,
+                           const uint32_t num_cols_y, int32_t *__restrict__ Y) {
+  int32_t *queue_y_0;
+  int32_t *queue_y_1;
+  int32_t *queue_y_2;
+  int32_t curr_y_0;
+  int32_t curr_y_1;
+  int32_t curr_y_2;
+  int32_t total_y;
+
+  // Assign queues
+  queue_y_0 = queues_row_acc[0][kernel_id];
+  queue_y_1 = queues_row_acc[1][kernel_id];
+  queue_y_2 = queues_row_acc[2][kernel_id];
+
+  // Execute row-wise systolic 2d convolution
+  for (uint32_t row = kernel_id; row < num_rows_y; row += NUM_KERNELS) {
+    // Accumulate and Store
+    for (uint32_t col = 0; col < num_cols_y; ++col) {
+      queue_pop(queue_y_0, &curr_y_0);
+      queue_pop(queue_y_1, &curr_y_1);
+      queue_pop(queue_y_2, &curr_y_2);
+      total_y = curr_y_0 + curr_y_1 + curr_y_2;
+      Y[row * num_cols_y + col] = total_y;
+    }
+  }
+}

From 8a3b524bed3758ba8ec2153329c054a4cbb1034c Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Wed, 23 Jun 2021 13:41:12 +0200
Subject: [PATCH 15/24] [apps] Improve conv_xqueue code - change PEs role names
 - add different core mapping - flush queues at the end of execution

---
 software/apps/systolic/conv_xqueue/main.c | 93 +++++++++++++++++------
 software/runtime/systolic/conv_xqueue.h   | 42 +++++-----
 2 files changed, 91 insertions(+), 44 deletions(-)

diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
index e3b3644ab..7184962f7 100644
--- a/software/apps/systolic/conv_xqueue/main.c
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -36,7 +36,7 @@
 
 // Dimensions of maps
 #define KERNEL_ROWS KERNEL_SIZE
-#define KERNEL_COLS KERNEL_SIZE *NUM_KERNELS
+#define KERNEL_COLS (KERNEL_SIZE * NUM_KERNELS)
 #define NUM_ACCS NUM_KERNELS
 
 uint32_t *kernel_tile_map;
@@ -101,17 +101,23 @@ int main() {
   // ACC COMBO
   // ----------
 
-  // TODO: VISUAL DESCRIPTION
+  // XY: X = Tile and Y = Core % 4
+  //
+  // 00 01 30 **
+  // 10 11 31 33 02 03 40 **
+  // 20 21 32 ** 12 13 41 43
+  //             22 23 42 **
+
   // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3
 
-  kernel_id = tile_id / 5;
-  uint32_t kernel_pair_id = tile_id % 5;
+  uint32_t group_id = tile_id / 5;
+  uint32_t group_tile_id = tile_id % 5;
   uint32_t tile_core_id = core_id % 4;
-  if (kernel_pair_id < 3) {
+  if (group_tile_id < 3) {
     is_kernel_core = 1;
-    kernel_row = kernel_pair_id;
+    kernel_row = group_tile_id;
     kernel_col = tile_core_id % 2;
-    kernel_id += tile_core_id / 2;
+    kernel_id = 2 * group_id + (tile_core_id / 2);
   } else {
     if (tile_core_id == 3) {
       is_kernel_core = 0;
@@ -120,9 +126,47 @@ int main() {
       kernel_row = tile_core_id;
       kernel_col = 2;
     }
-    kernel_id += kernel_pair_id % 3;
+    kernel_id = 2 * group_id + (group_tile_id % 3);
   }
 
+  // ----------
+  // LONG ROWS
+  // ----------
+
+  // XY: X = Tile and Y = Core % 4
+  //
+  // 00 01 02 **
+  // 10 11 12 90 13 30 31 **
+  // 20 21 22 ** 23 40 41 91 42 43 60 **
+  //             03 50 51 ** 52 53 70 92 71 72 73 **
+  //                         32 33 80 ** 81 82 83 93
+  //                                     61 62 63 **
+
+  // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3
+
+  // uint32_t group_id = tile_id / 10;
+  // uint32_t group_tile_id = tile_id % 10;
+  // uint32_t tile_core_id = core_id % 4;
+  // if (group_tile_id < 9) {
+  //   is_kernel_core = 1;
+  //   uint32_t group_kernel_id = group_tile_id / 3;
+  //   kernel_row = group_tile_id % 3;
+  //   kernel_col = (tile_core_id + group_kernel_id) % 3;
+  //   uint32_t threshold = 3 - group_kernel_id;
+  //   if (tile_core_id >= threshold) {
+  //     group_kernel_id += 1;
+  //     if (kernel_row == 0) {
+  //       kernel_row = 2;
+  //     } else {
+  //       kernel_row -= 1;
+  //     }
+  //   }
+  //   kernel_id = 4 * group_id + group_kernel_id;
+  // } else {
+  //   is_kernel_core = 0;
+  //   kernel_id = 4 * group_id + tile_core_id;
+  // }
+
   // Core is only enabled if its kernel is required
   if (kernel_id < NUM_KERNELS) {
     is_enabled = 1;
@@ -136,8 +180,9 @@ int main() {
   // Set tile and core maps
   if (is_enabled) {
     if (is_kernel_core) {
-      kernel_tile_map[kernel_row * KERNEL_COLS + kernel_col] = tile_id;
-      kernel_core_map[kernel_row * KERNEL_COLS + kernel_col] = core_id;
+      uint32_t map_col = KERNEL_SIZE * kernel_id + kernel_col;
+      kernel_tile_map[kernel_row * KERNEL_COLS + map_col] = tile_id;
+      kernel_core_map[kernel_row * KERNEL_COLS + map_col] = core_id;
     } else {
       row_acc_tile_map[kernel_id] = tile_id;
       row_acc_core_map[kernel_id] = core_id;
@@ -191,35 +236,35 @@ int main() {
       switch (kernel_col) {
       case 0:
         if (kernel_id == 0) {
-          systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                     matrix_X, (int32_t *)weights);
+          systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                    matrix_X, (int32_t *)weights);
         } else {
           if (kernel_row == 2) {
-            systolic_conv_first_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                       matrix_X, (int32_t *)weights);
+            systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                      matrix_X, (int32_t *)weights);
           } else {
-            systolic_conv_leader(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                 (int32_t *)weights);
+            systolic_conv_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                (int32_t *)weights);
           }
         }
         break;
       case (KERNEL_SIZE - 1):
         if (kernel_id == NUM_KERNELS - 1) {
-          systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                  (int32_t *)weights);
+          systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                 (int32_t *)weights);
         } else {
           if (kernel_row == 0) {
-            systolic_conv_last_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                    (int32_t *)weights);
+            systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                                   (int32_t *)weights);
           } else {
-            systolic_conv_NAME(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                               (int32_t *)weights);
+            systolic_conv_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                              (int32_t *)weights);
           }
         }
         break;
       default:
-        systolic_conv_follower(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                               (int32_t *)weights);
+        systolic_conv_mid(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
+                          (int32_t *)weights);
       }
     } else {
       systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y);
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
index 8730dcef3..85e372f42 100644
--- a/software/runtime/systolic/conv_xqueue.h
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -36,7 +36,7 @@
 #define KERNEL_SIZE 3
 
 // Number of kernels
-#define NUM_KERNELS 1
+#define NUM_KERNELS 25
 
 // Array of queue ptrs in row-major order (concatenated kernels)
 int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE];
@@ -116,12 +116,11 @@ void systolic_init(uint32_t const *kernel_tile_map,
   // }
 }
 
-void systolic_conv_first_leader(const uint32_t kernel_id,
-                                const uint32_t kernel_row,
-                                const uint32_t num_rows,
-                                const uint32_t num_cols,
-                                int32_t const *__restrict__ X,
-                                int32_t const *__restrict__ W) {
+void systolic_conv_first_front(const uint32_t kernel_id,
+                               const uint32_t kernel_row,
+                               const uint32_t num_rows, const uint32_t num_cols,
+                               int32_t const *__restrict__ X,
+                               int32_t const *__restrict__ W) {
   int32_t *queue_next_x;
   int32_t *queue_next_y;
   int32_t resp_x __attribute__((unused)) = 0;
@@ -161,9 +160,9 @@ void systolic_conv_first_leader(const uint32_t kernel_id,
   }
 }
 
-void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row,
-                          const uint32_t num_rows, const uint32_t num_cols,
-                          int32_t const *__restrict__ W) {
+void systolic_conv_front(const uint32_t kernel_id, const uint32_t kernel_row,
+                         const uint32_t num_rows, const uint32_t num_cols,
+                         int32_t const *__restrict__ W) {
   int32_t *queue_prev_x;
   int32_t *queue_next_x;
   int32_t *queue_next_y;
@@ -205,9 +204,9 @@ void systolic_conv_leader(const uint32_t kernel_id, const uint32_t kernel_row,
   }
 }
 
-void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row,
-                            const uint32_t num_rows, const uint32_t num_cols,
-                            int32_t const *__restrict__ W) {
+void systolic_conv_mid(const uint32_t kernel_id, const uint32_t kernel_row,
+                       const uint32_t num_rows, const uint32_t num_cols,
+                       int32_t const *__restrict__ W) {
   int32_t *queue_prev_x;
   int32_t *queue_next_x;
   int32_t *queue_prev_y;
@@ -248,9 +247,9 @@ void systolic_conv_follower(const uint32_t kernel_id, const uint32_t kernel_row,
   }
 }
 
-void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row,
-                        const uint32_t num_rows, const uint32_t num_cols,
-                        int32_t const *__restrict__ W) {
+void systolic_conv_end(const uint32_t kernel_id, const uint32_t kernel_row,
+                       const uint32_t num_rows, const uint32_t num_cols,
+                       int32_t const *__restrict__ W) {
   int32_t *queue_prev_x;
   int32_t *queue_next_x;
   int32_t *queue_prev_y;
@@ -289,12 +288,15 @@ void systolic_conv_NAME(const uint32_t kernel_id, const uint32_t kernel_row,
     queue_pop(queue_prev_x, &curr_x);
     queue_push(queue_next_x, curr_x, &resp_x);
   }
+
+  // Flush next queues at the end of execution
+  queue_pop(queue_next_x, &curr_x);
+  queue_pop(queue_next_x, &curr_x);
 }
 
-void systolic_conv_last_NAME(const uint32_t kernel_id,
-                             const uint32_t kernel_row, const uint32_t num_rows,
-                             const uint32_t num_cols,
-                             int32_t const *__restrict__ W) {
+void systolic_conv_last_end(const uint32_t kernel_id, const uint32_t kernel_row,
+                            const uint32_t num_rows, const uint32_t num_cols,
+                            int32_t const *__restrict__ W) {
   int32_t *queue_prev_x;
   int32_t *queue_prev_y;
   int32_t *queue_next_y;

From e29d84cb822ccb318aba054d66d95182042dd76e Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Fri, 25 Jun 2021 02:43:19 +0200
Subject: [PATCH 16/24] [apps] Improve 2d conv density for conv_xqueue

---
 software/apps/systolic/conv_xqueue/main.c | 178 +----
 software/runtime/systolic/conv_xqueue.h   | 840 +++++++++++++++-------
 2 files changed, 597 insertions(+), 421 deletions(-)

diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
index 7184962f7..6fd8045b5 100644
--- a/software/apps/systolic/conv_xqueue/main.c
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -31,23 +31,17 @@
 #define DIM_X_N 32
 
 // Dimensions of matrix Y
-#define DIM_Y_M (DIM_X_M - (KERNEL_SIZE - 1))
-#define DIM_Y_N (DIM_X_N - (KERNEL_SIZE - 1))
+#define DIM_Y_M (DIM_X_M - 2)
+#define DIM_Y_N (DIM_X_N - 2)
 
-// Dimensions of maps
-#define KERNEL_ROWS KERNEL_SIZE
-#define KERNEL_COLS (KERNEL_SIZE * NUM_KERNELS)
-#define NUM_ACCS NUM_KERNELS
-
-uint32_t *kernel_tile_map;
-uint32_t *kernel_core_map;
-uint32_t *row_acc_tile_map;
-uint32_t *row_acc_core_map;
+uint32_t *tile_map;
+uint32_t *core_map;
 
 int32_t *matrix_X;
 int32_t *matrix_Y;
 
 int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+int32_t *matrix_W = (int32_t *)weights;
 
 void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
                               uint32_t num_cols) {
@@ -84,110 +78,16 @@ int main() {
 
   // Allocate tile and core maps
   if (core_id == 0) {
-    kernel_tile_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4);
-    kernel_core_map = (uint32_t *)simple_malloc(KERNEL_ROWS * KERNEL_COLS * 4);
-    row_acc_tile_map = (uint32_t *)simple_malloc(NUM_ACCS * 4);
-    row_acc_core_map = (uint32_t *)simple_malloc(NUM_ACCS * 4);
-  }
-
-  // Systolic identifiers
-  int32_t is_enabled = 0;
-  int32_t is_kernel_core = 0;
-  uint32_t kernel_id = 0;
-  uint32_t kernel_row = 0;
-  uint32_t kernel_col = 0;
-
-  // ----------
-  // ACC COMBO
-  // ----------
-
-  // XY: X = Tile and Y = Core % 4
-  //
-  // 00 01 30 **
-  // 10 11 31 33 02 03 40 **
-  // 20 21 32 ** 12 13 41 43
-  //             22 23 42 **
-
-  // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3
-
-  uint32_t group_id = tile_id / 5;
-  uint32_t group_tile_id = tile_id % 5;
-  uint32_t tile_core_id = core_id % 4;
-  if (group_tile_id < 3) {
-    is_kernel_core = 1;
-    kernel_row = group_tile_id;
-    kernel_col = tile_core_id % 2;
-    kernel_id = 2 * group_id + (tile_core_id / 2);
-  } else {
-    if (tile_core_id == 3) {
-      is_kernel_core = 0;
-    } else {
-      is_kernel_core = 1;
-      kernel_row = tile_core_id;
-      kernel_col = 2;
-    }
-    kernel_id = 2 * group_id + (group_tile_id % 3);
-  }
-
-  // ----------
-  // LONG ROWS
-  // ----------
-
-  // XY: X = Tile and Y = Core % 4
-  //
-  // 00 01 02 **
-  // 10 11 12 90 13 30 31 **
-  // 20 21 22 ** 23 40 41 91 42 43 60 **
-  //             03 50 51 ** 52 53 70 92 71 72 73 **
-  //                         32 33 80 ** 81 82 83 93
-  //                                     61 62 63 **
-
-  // TODO: CURRENTLY ONLY WORKS FOR KERNEL_SIZE == 3
-
-  // uint32_t group_id = tile_id / 10;
-  // uint32_t group_tile_id = tile_id % 10;
-  // uint32_t tile_core_id = core_id % 4;
-  // if (group_tile_id < 9) {
-  //   is_kernel_core = 1;
-  //   uint32_t group_kernel_id = group_tile_id / 3;
-  //   kernel_row = group_tile_id % 3;
-  //   kernel_col = (tile_core_id + group_kernel_id) % 3;
-  //   uint32_t threshold = 3 - group_kernel_id;
-  //   if (tile_core_id >= threshold) {
-  //     group_kernel_id += 1;
-  //     if (kernel_row == 0) {
-  //       kernel_row = 2;
-  //     } else {
-  //       kernel_row -= 1;
-  //     }
-  //   }
-  //   kernel_id = 4 * group_id + group_kernel_id;
-  // } else {
-  //   is_kernel_core = 0;
-  //   kernel_id = 4 * group_id + tile_core_id;
-  // }
-
-  // Core is only enabled if its kernel is required
-  if (kernel_id < NUM_KERNELS) {
-    is_enabled = 1;
-  } else {
-    is_enabled = 0;
+    tile_map = (uint32_t *)simple_malloc(num_cores * 4);
+    core_map = (uint32_t *)simple_malloc(num_cores * 4);
   }
 
   // Wait for all cores
   mempool_barrier(num_cores);
 
   // Set tile and core maps
-  if (is_enabled) {
-    if (is_kernel_core) {
-      uint32_t map_col = KERNEL_SIZE * kernel_id + kernel_col;
-      kernel_tile_map[kernel_row * KERNEL_COLS + map_col] = tile_id;
-      kernel_core_map[kernel_row * KERNEL_COLS + map_col] = core_id;
-    } else {
-      row_acc_tile_map[kernel_id] = tile_id;
-      row_acc_core_map[kernel_id] = core_id;
-    }
-  }
+  tile_map[core_id] = tile_id;
+  core_map[core_id] = core_id;
 
   // Wait for all cores
   mempool_barrier(num_cores);
@@ -197,14 +97,11 @@ int main() {
     printf("> Initialize\n");
 
     // Print out maps
-    // print_matrix((int32_t *)kernel_tile_map, KERNEL_ROWS, KERNEL_COLS);
-    // print_matrix((int32_t *)kernel_core_map, KERNEL_ROWS, KERNEL_COLS);
-    // print_matrix((int32_t *)row_acc_tile_map, 1, NUM_ACCS);
-    // print_matrix((int32_t *)row_acc_core_map, 1, NUM_ACCS);
+    // print_matrix((int32_t *)tile_map, 1, num_cores);
+    // print_matrix((int32_t *)core_map, 1, num_cores);
 
     // Initialize systolic array
-    systolic_init(kernel_tile_map, kernel_core_map, row_acc_tile_map,
-                  row_acc_core_map);
+    systolic_init(tile_map, core_map);
 
     // Create and initialize matrices
     generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N);
@@ -231,44 +128,15 @@ int main() {
   // Wait for all cores
   mempool_barrier(num_cores);
 
-  if (is_enabled) {
-    if (is_kernel_core) {
-      switch (kernel_col) {
-      case 0:
-        if (kernel_id == 0) {
-          systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                    matrix_X, (int32_t *)weights);
-        } else {
-          if (kernel_row == 2) {
-            systolic_conv_first_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                      matrix_X, (int32_t *)weights);
-          } else {
-            systolic_conv_front(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                (int32_t *)weights);
-          }
-        }
-        break;
-      case (KERNEL_SIZE - 1):
-        if (kernel_id == NUM_KERNELS - 1) {
-          systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                 (int32_t *)weights);
-        } else {
-          if (kernel_row == 0) {
-            systolic_conv_last_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                                   (int32_t *)weights);
-          } else {
-            systolic_conv_end(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                              (int32_t *)weights);
-          }
-        }
-        break;
-      default:
-        systolic_conv_mid(kernel_id, kernel_row, DIM_X_M, DIM_X_N,
-                          (int32_t *)weights);
-      }
-    } else {
-      systolic_conv_row_acc(kernel_id, DIM_Y_M, DIM_Y_N, matrix_Y);
-    }
+  switch (core_id) {
+  case 0:
+    systolic_conv_front(DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  case (NUM_CORES - 1):
+    systolic_conv_end(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  default:
+    systolic_conv_mid(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
   }
 
   // Wait for all cores
@@ -285,8 +153,8 @@ int main() {
     printf("> End\n");
 
     // Print out matrix Y
-    printf("> Print Matrix Y\n");
-    print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N);
+    // printf("> Print Matrix Y\n");
+    // print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N);
   }
 
   // wait until all cores have finished
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
index 85e372f42..7224acb90 100644
--- a/software/runtime/systolic/conv_xqueue.h
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -21,8 +21,8 @@
  */
 
 /* TODO DESCRIPTION
- *
- *
+ * TODO: LIMITATION NUM_COLS_Y >= 2 <=> NUM_COLS >= 4
+ * TODO: COMPLETELY FIXED TO KERNEL SIZE OF 3
  *
  *
  *
@@ -32,16 +32,9 @@
 #include "alloc.h"
 #include "printf.h"
 
-// Kernel size (fixed)
-#define KERNEL_SIZE 3
-
-// Number of kernels
-#define NUM_KERNELS 25
-
 // Array of queue ptrs in row-major order (concatenated kernels)
-int32_t *queues_x[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE];
-int32_t *queues_y[KERNEL_SIZE][NUM_KERNELS * KERNEL_SIZE];
-int32_t *queues_row_acc[KERNEL_SIZE][NUM_KERNELS];
+int32_t *queues_x_0[NUM_CORES];
+int32_t *queues_x_1[NUM_CORES];
 
 // queue push
 static inline void queue_push(void *const queue, int32_t data,
@@ -54,307 +47,622 @@ inline void queue_pop(void *const queue, int32_t *const ret) {
   asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
 }
 
-void systolic_init(uint32_t const *kernel_tile_map,
-                   uint32_t const *kernel_core_map,
-                   uint32_t const *row_acc_tile_map,
-                   uint32_t const *row_acc_core_map) {
+void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) {
   // Create systolic array via queues
   extern int32_t __seq_start;
-  uint32_t grid_pos;
   uint32_t tile_id;
   uint32_t core_id;
   uint32_t tile_offset;
   uint32_t core_offset;
 
-  // Kernel queues
-  grid_pos = 0;
-  for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
-    for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) {
-      tile_id = kernel_tile_map[grid_pos];
-      core_id = kernel_core_map[grid_pos];
-      tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
-      core_offset = core_id % 4 * 4;
-      queues_x[y][x] = &__seq_start + tile_offset + core_offset + 0;
-      queues_y[y][x] = &__seq_start + tile_offset + core_offset + 1;
-      ++grid_pos;
-    }
-  }
-
-  // Row accumulator queues
-  grid_pos = 0;
-  for (uint32_t x = 0; x < NUM_KERNELS; ++x) {
-    tile_id = row_acc_tile_map[x];
-    core_id = row_acc_core_map[x];
+  for (uint32_t i = 0; i < NUM_CORES; ++i) {
+    tile_id = tile_map[i];
+    core_id = core_map[i];
     tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
     core_offset = core_id % 4 * 4;
-    queues_row_acc[0][x] = &__seq_start + tile_offset + core_offset + 0;
-    queues_row_acc[1][x] = &__seq_start + tile_offset + core_offset + 1;
-    queues_row_acc[2][x] = &__seq_start + tile_offset + core_offset + 2;
+    queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0;
+    queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1;
   }
 
   // Print out queue addresses
-  // printf("queues_x\n");
-  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
-  //   for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) {
-  //     printf("%5d ", queues_x[y][x]);
-  //   }
-  //   printf("\n");
-  // }
-  // printf("queues_y\n");
-  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
-  //   for (uint32_t x = 0; x < NUM_KERNELS * KERNEL_SIZE; ++x) {
-  //     printf("%5d ", queues_y[y][x]);
-  //   }
-  //   printf("\n");
+  // printf("queues_x_0\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_0[i]);
   // }
-  // printf("queues_row_acc\n");
-  // for (uint32_t y = 0; y < KERNEL_SIZE; ++y) {
-  //   for (uint32_t x = 0; x < NUM_KERNELS; ++x) {
-  //     printf("%5d ", queues_row_acc[y][x]);
-  //   }
-  //   printf("\n");
+  // printf("\n");
+  // printf("queues_x_1\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_1[i]);
   // }
+  // printf("\n");
 }
 
-void systolic_conv_first_front(const uint32_t kernel_id,
-                               const uint32_t kernel_row,
-                               const uint32_t num_rows, const uint32_t num_cols,
-                               int32_t const *__restrict__ X,
-                               int32_t const *__restrict__ W) {
-  int32_t *queue_next_x;
-  int32_t *queue_next_y;
-  int32_t resp_x __attribute__((unused)) = 0;
-  int32_t resp_y __attribute__((unused)) = 0;
-  int32_t weight;
-  int32_t curr_x;
-  int32_t curr_y;
-  uint32_t first_row = kernel_id + kernel_row;
-  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
+                         int32_t const *__restrict__ X,
+                         int32_t const *__restrict__ W,
+                         int32_t *__restrict__ Y) {
+  int32_t *queue_next_x_0;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
 
   // Assign queues
-  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
-  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1];
+  queue_next_x_0 = queues_x_0[1];
+  queue_next_x_1 = queues_x_1[1];
 
-  // Load weight
-  weight = W[kernel_row * KERNEL_SIZE + 0];
-
-  // Execute row-wise systolic 2d convolution
-  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
-    // Populate kernel
-    curr_x = X[row * num_cols + 0];
-    queue_push(queue_next_x, curr_x, &resp_x);
-    curr_x = X[row * num_cols + 1];
-    queue_push(queue_next_x, curr_x, &resp_x);
-    curr_x = X[row * num_cols + 2];
-    // Convolution
-    for (uint32_t col = 3; col < num_cols; ++col) {
-      queue_push(queue_next_x, curr_x, &resp_x);
-      curr_y = curr_x * weight;
-      curr_x = X[row * num_cols + col];
-      queue_push(queue_next_y, curr_y, &resp_y);
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
     }
-    // Flush kernel
-    queue_push(queue_next_x, curr_x, &resp_x);
-    curr_y = curr_x * weight;
-    queue_push(queue_next_y, curr_y, &resp_y);
   }
-}
-
-void systolic_conv_front(const uint32_t kernel_id, const uint32_t kernel_row,
-                         const uint32_t num_rows, const uint32_t num_cols,
-                         int32_t const *__restrict__ W) {
-  int32_t *queue_prev_x;
-  int32_t *queue_next_x;
-  int32_t *queue_next_y;
-  int32_t resp_x __attribute__((unused)) = 0;
-  int32_t resp_y __attribute__((unused)) = 0;
-  int32_t weight;
-  int32_t curr_x;
-  int32_t curr_y;
-  uint32_t first_row = kernel_id + kernel_row;
-  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
-
-  // Assign queues
-  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 0];
-  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
-  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1];
-
-  // Load weight
-  weight = W[kernel_row * KERNEL_SIZE + 0];
 
   // Execute row-wise systolic 2d convolution
-  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
-    // Populate kernel
-    queue_pop(queue_prev_x, &curr_x);
-    queue_push(queue_next_x, curr_x, &resp_x);
-    queue_pop(queue_prev_x, &curr_x);
-    queue_push(queue_next_x, curr_x, &resp_x);
-    queue_pop(queue_prev_x, &curr_x);
-    // Convolution
-    for (uint32_t col = 3; col < num_cols; ++col) {
-      queue_push(queue_next_x, curr_x, &resp_x);
-      curr_y = curr_x * weight;
-      queue_pop(queue_prev_x, &curr_x);
-      queue_push(queue_next_y, curr_y, &resp_y);
+  row = 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 1];
+    curr_x[2] = X[(row - 0) * num_cols + 1];
+    curr_x[0] = X[(row - 2) * num_cols + 1];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 2nd column of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[2] += curr_x[2] * weights[2][1];
+    // MACs with 1st column of weights
+    acc_y[0] += curr_x[0] * weights[0][0];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    // -----------
+    // CONVOLUTION
+    // -----------
+    col = 2;
+    while (col < num_cols_y) {
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col];
+      curr_x[2] = X[(row - 0) * num_cols + col];
+      curr_x[0] = X[(row - 2) * num_cols + col];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 3th column of weights
+      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+      // MACs with 2nd column of weights
+      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+      // MACs with 1st column of weights
+      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
+      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
+      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      // Reset finished accumulation
+      acc_y[col % 3] = 0;
+      // Increment column index
+      ++col;
     }
-    // Flush kernel
-    queue_push(queue_next_x, curr_x, &resp_x);
-    curr_y = curr_x * weight;
-    queue_push(queue_next_y, curr_y, &resp_y);
+    // -------
+    // FLUSH 0
+    // -------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + col];
+    curr_x[2] = X[(row - 0) * num_cols + col];
+    curr_x[0] = X[(row - 2) * num_cols + col];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // MACs with 2nd column of weights
+    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // Increment column index
+    ++col;
+    // -------
+    // FLUSH 1
+    // -------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + col];
+    curr_x[2] = X[(row - 0) * num_cols + col];
+    curr_x[0] = X[(row - 2) * num_cols + col];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // -------------
+    // INCREMENT ROW
+    // -------------
+    row += NUM_CORES;
   }
-}
 
-void systolic_conv_mid(const uint32_t kernel_id, const uint32_t kernel_row,
-                       const uint32_t num_rows, const uint32_t num_cols,
-                       int32_t const *__restrict__ W) {
-  int32_t *queue_prev_x;
-  int32_t *queue_next_x;
-  int32_t *queue_prev_y;
-  int32_t *queue_next_y;
-  int32_t resp_x __attribute__((unused)) = 0;
-  int32_t resp_y __attribute__((unused)) = 0;
-  int32_t weight;
-  int32_t curr_x;
-  int32_t curr_y;
-  uint32_t first_row = kernel_id + kernel_row;
-  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
-
-  // Assign queues
-  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 1];
-  queue_next_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
-  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 1];
-  queue_next_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
-
-  // Load weight
-  weight = W[kernel_row * KERNEL_SIZE + 1];
-
-  // Execute row-wise systolic 2d convolution
-  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
-    // Populate kernel
-    queue_pop(queue_prev_x, &curr_x);
-    queue_push(queue_next_x, curr_x, &resp_x);
-    queue_pop(queue_prev_x, &curr_x);
-    // Convolution
-    for (uint32_t col = 2; col < num_cols; ++col) {
-      queue_pop(queue_prev_y, &curr_y);
-      queue_push(queue_next_x, curr_x, &resp_x);
-      curr_y += curr_x * weight;
-      queue_pop(queue_prev_x, &curr_x);
-      queue_push(queue_next_y, curr_y, &resp_y);
+  // Finish last row of systolic 2d convolution without pushing
+  if (row == num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 1];
+    curr_x[2] = X[(row - 0) * num_cols + 1];
+    curr_x[0] = X[(row - 2) * num_cols + 1];
+    // MACs with 2nd column of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[2] += curr_x[2] * weights[2][1];
+    // MACs with 1st column of weights
+    acc_y[0] += curr_x[0] * weights[0][0];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    // -----------
+    // CONVOLUTION
+    // -----------
+    col = 2;
+    while (col < num_cols_y) {
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col];
+      curr_x[2] = X[(row - 0) * num_cols + col];
+      curr_x[0] = X[(row - 2) * num_cols + col];
+      // MACs with 3th column of weights
+      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+      // MACs with 2nd column of weights
+      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+      // MACs with 1st column of weights
+      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
+      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
+      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      // Reset finished accumulation
+      acc_y[col % 3] = 0;
+      // Increment column index
+      ++col;
     }
-    // Flush kernel
-    queue_push(queue_next_x, curr_x, &resp_x);
+    // -------
+    // FLUSH 0
+    // -------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + col];
+    curr_x[2] = X[(row - 0) * num_cols + col];
+    curr_x[0] = X[(row - 2) * num_cols + col];
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // MACs with 2nd column of weights
+    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // Increment column index
+    ++col;
+    // -------
+    // FLUSH 1
+    // -------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + col];
+    curr_x[2] = X[(row - 0) * num_cols + col];
+    curr_x[0] = X[(row - 2) * num_cols + col];
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
   }
 }
 
-void systolic_conv_end(const uint32_t kernel_id, const uint32_t kernel_row,
-                       const uint32_t num_rows, const uint32_t num_cols,
-                       int32_t const *__restrict__ W) {
-  int32_t *queue_prev_x;
-  int32_t *queue_next_x;
-  int32_t *queue_prev_y;
-  int32_t *queue_next_y;
-  int32_t resp_x __attribute__((unused)) = 0;
-  int32_t resp_y __attribute__((unused)) = 0;
-  int32_t weight;
-  int32_t curr_x;
-  int32_t curr_y;
-  uint32_t first_row = kernel_id + kernel_row;
-  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
+                       const uint32_t num_cols, int32_t const *__restrict__ X,
+                       int32_t const *__restrict__ W, int32_t *__restrict__ Y) {
+  int32_t *queue_prev_x_0;
+  int32_t *queue_next_x_0;
+  int32_t *queue_prev_x_1;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
 
   // Assign queues
-  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
-  queue_next_x = queues_x[kernel_row + 1][(kernel_id + 1) * KERNEL_SIZE];
-  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
-  queue_next_y = queues_row_acc[kernel_row][kernel_id];
-
-  // Load weight
-  weight = W[kernel_row * KERNEL_SIZE + 2];
+  queue_prev_x_0 = queues_x_0[kernel_id];
+  queue_next_x_0 = queues_x_0[kernel_id + 1];
+  queue_prev_x_1 = queues_x_1[kernel_id];
+  queue_next_x_1 = queues_x_1[kernel_id + 1];
+
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
+    }
+  }
 
   // Execute row-wise systolic 2d convolution
-  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
-    // Populate kernel
-    queue_pop(queue_prev_x, &curr_x);
-    // Convolution
-    for (uint32_t col = 1; col < num_cols - 1; ++col) {
-      queue_pop(queue_prev_y, &curr_y);
-      queue_push(queue_next_x, curr_x, &resp_x);
-      curr_y += curr_x * weight;
-      queue_pop(queue_prev_x, &curr_x);
-      queue_push(queue_next_y, curr_y, &resp_y);
+  row = kernel_id + 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 0];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 1];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 2nd column of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[2] += curr_x[2] * weights[2][1];
+    // MACs with 1st column of weights
+    acc_y[0] += curr_x[0] * weights[0][0];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    // -----------
+    // CONVOLUTION
+    // -----------
+    col = 2;
+    while (col < num_cols_y) {
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 3th column of weights
+      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+      // MACs with 2nd column of weights
+      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+      // MACs with 1st column of weights
+      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
+      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
+      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      // Reset finished accumulation
+      acc_y[col % 3] = 0;
+      // Increment column index
+      ++col;
     }
-    // Flush kernel
-    queue_push(queue_next_x, curr_x, &resp_x);
-    queue_pop(queue_prev_x, &curr_x);
-    queue_push(queue_next_x, curr_x, &resp_x);
+    // -------
+    // FLUSH 0
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // MACs with 2nd column of weights
+    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // Increment column index
+    ++col;
+    // -------
+    // FLUSH 1
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // -------------
+    // INCREMENT ROW
+    // -------------
+    row += NUM_CORES;
   }
 
-  // Flush next queues at the end of execution
-  queue_pop(queue_next_x, &curr_x);
-  queue_pop(queue_next_x, &curr_x);
+  // Finish last row of systolic 2d convolution without pushing
+  if (row == num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 0];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 1];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 2nd column of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[2] += curr_x[2] * weights[2][1];
+    // MACs with 1st column of weights
+    acc_y[0] += curr_x[0] * weights[0][0];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    // -----------
+    // CONVOLUTION
+    // -----------
+    col = 2;
+    while (col < num_cols_y) {
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 3th column of weights
+      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+      // MACs with 2nd column of weights
+      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+      // MACs with 1st column of weights
+      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
+      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
+      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      // Reset finished accumulation
+      acc_y[col % 3] = 0;
+      // Increment column index
+      ++col;
+    }
+    // -------
+    // FLUSH 0
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // MACs with 2nd column of weights
+    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // Increment column index
+    ++col;
+    // -------
+    // FLUSH 1
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+  }
 }
 
-void systolic_conv_last_end(const uint32_t kernel_id, const uint32_t kernel_row,
-                            const uint32_t num_rows, const uint32_t num_cols,
-                            int32_t const *__restrict__ W) {
-  int32_t *queue_prev_x;
-  int32_t *queue_prev_y;
-  int32_t *queue_next_y;
-  int32_t resp_y __attribute__((unused)) = 0;
-  int32_t weight;
-  int32_t curr_x;
-  int32_t curr_y;
-  uint32_t first_row = kernel_id + kernel_row;
-  uint32_t last_row = num_rows - KERNEL_SIZE + kernel_row + 1;
+void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
+                       const uint32_t num_cols, int32_t const *__restrict__ X,
+                       int32_t const *__restrict__ W, int32_t *__restrict__ Y) {
+  int32_t *queue_prev_x_0;
+  int32_t *queue_prev_x_1;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  int32_t acc_y[3] = {0, 0, 0};
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
 
   // Assign queues
-  queue_prev_x = queues_x[kernel_row][kernel_id * KERNEL_SIZE + 2];
-  queue_prev_y = queues_y[kernel_row][kernel_id * KERNEL_SIZE + 2];
-  queue_next_y = queues_row_acc[kernel_row][kernel_id];
+  queue_prev_x_0 = queues_x_0[kernel_id];
+  queue_prev_x_1 = queues_x_1[kernel_id];
 
-  // Load weight
-  weight = W[kernel_row * KERNEL_SIZE + 2];
-
-  // Execute row-wise systolic 2d convolution
-  for (uint32_t row = first_row; row < last_row; row += NUM_KERNELS) {
-    // Populate kernel
-    queue_pop(queue_prev_x, &curr_x);
-    // Convolution
-    for (uint32_t col = 1; col < num_cols - 1; ++col) {
-      queue_pop(queue_prev_y, &curr_y);
-      curr_y += curr_x * weight;
-      queue_pop(queue_prev_x, &curr_x);
-      queue_push(queue_next_y, curr_y, &resp_y);
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
     }
-    // Flush kernel
-    queue_pop(queue_prev_x, &curr_x);
   }
-}
-
-void systolic_conv_row_acc(const uint32_t kernel_id, const uint32_t num_rows_y,
-                           const uint32_t num_cols_y, int32_t *__restrict__ Y) {
-  int32_t *queue_y_0;
-  int32_t *queue_y_1;
-  int32_t *queue_y_2;
-  int32_t curr_y_0;
-  int32_t curr_y_1;
-  int32_t curr_y_2;
-  int32_t total_y;
-
-  // Assign queues
-  queue_y_0 = queues_row_acc[0][kernel_id];
-  queue_y_1 = queues_row_acc[1][kernel_id];
-  queue_y_2 = queues_row_acc[2][kernel_id];
 
   // Execute row-wise systolic 2d convolution
-  for (uint32_t row = kernel_id; row < num_rows_y; row += NUM_KERNELS) {
-    // Accumulate and Store
-    for (uint32_t col = 0; col < num_cols_y; ++col) {
-      queue_pop(queue_y_0, &curr_y_0);
-      queue_pop(queue_y_1, &curr_y_1);
-      queue_pop(queue_y_2, &curr_y_2);
-      total_y = curr_y_0 + curr_y_1 + curr_y_2;
-      Y[row * num_cols_y + col] = total_y;
+  for (uint32_t row = kernel_id + 2; row < num_rows; row += NUM_CORES) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 0];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 1];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 2nd column of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[2] += curr_x[2] * weights[2][1];
+    // MACs with 1st column of weights
+    acc_y[0] += curr_x[0] * weights[0][0];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    // -----------
+    // CONVOLUTION
+    // -----------
+    col = 2;
+    while (col < num_cols_y) {
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 3th column of weights
+      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+      // MACs with 2nd column of weights
+      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+      // MACs with 1st column of weights
+      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
+      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
+      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      // Reset finished accumulation
+      acc_y[col % 3] = 0;
+      // Increment column index
+      ++col;
     }
+    // -------
+    // FLUSH 0
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // MACs with 2nd column of weights
+    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
+    // Increment column index
+    ++col;
+    // -------
+    // FLUSH 1
+    // -------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + col];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // MACs with 3th column of weights
+    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
+    // Store finished accumulation
+    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+    // Reset finished accumulation
+    acc_y[col % 3] = 0;
   }
 }

From 0fbe88bd7e3b95b9c7a7796d6154dba9621d0a2f Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Fri, 25 Jun 2021 02:45:16 +0200
Subject: [PATCH 17/24] [apps] Improve conv_xqueue code - fix illegal multi
 queue pop - increase performance via fixed cyclical pattern - add shuffling
 of MACs to hide accelerator latency

---
 software/runtime/systolic/conv_xqueue.h | 903 ++++++++++++++++++++----
 1 file changed, 747 insertions(+), 156 deletions(-)

diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
index 7224acb90..096ec4427 100644
--- a/software/runtime/systolic/conv_xqueue.h
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -120,6 +120,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -130,42 +131,165 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 2nd column of weights
+    // MACs with 1st row of weights
     acc_y[2] += curr_x[0] * weights[0][1];
-    acc_y[2] += curr_x[1] * weights[1][1];
-    acc_y[2] += curr_x[2] * weights[2][1];
-    // MACs with 1st column of weights
     acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
     acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    // -----------
-    // CONVOLUTION
-    // -----------
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
     col = 2;
+    while (col < num_cols_y - 2) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 1];
+      curr_x[2] = X[(row - 0) * num_cols + col + 1];
+      curr_x[0] = X[(row - 2) * num_cols + col + 1];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 2];
+      curr_x[2] = X[(row - 0) * num_cols + col + 2];
+      curr_x[0] = X[(row - 2) * num_cols + col + 2];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
     while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
       // Load x vector
-      curr_x[1] = X[(row - 1) * num_cols + col];
-      curr_x[2] = X[(row - 0) * num_cols + col];
-      curr_x[0] = X[(row - 2) * num_cols + col];
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
       // Push lower part of x vector
       queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
       queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-      // MACs with 3th column of weights
-      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-      // MACs with 2nd column of weights
-      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-      // MACs with 1st column of weights
-      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
-      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
-      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
-      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Reset finished accumulation
-      acc_y[col % 3] = 0;
+      acc_y[2] = 0;
+      // Increment column index
+      ++col;
+      if (col >= num_cols_y) break;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector (col was already incremented above, so no extra offset)
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
       // Increment column index
       ++col;
     }
@@ -179,18 +303,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3th column of weights
+    // MACs with 1st row of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // MACs with 2nd column of weights
     acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    // MACs with 2nd row of weights
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    // MACs with 3rd row of weights
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
     // Increment column index
     ++col;
     // -------
@@ -203,14 +326,18 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3th column of weights
+    // MACs with 3rd column of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
     acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
     // -------------
     // INCREMENT ROW
     // -------------
@@ -237,39 +364,156 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     curr_x[1] = X[(row - 1) * num_cols + 1];
     curr_x[2] = X[(row - 0) * num_cols + 1];
     curr_x[0] = X[(row - 2) * num_cols + 1];
-    // MACs with 2nd column of weights
+    // MACs with 1st row of weights
     acc_y[2] += curr_x[0] * weights[0][1];
-    acc_y[2] += curr_x[1] * weights[1][1];
-    acc_y[2] += curr_x[2] * weights[2][1];
-    // MACs with 1st column of weights
     acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
     acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    // -----------
-    // CONVOLUTION
-    // -----------
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
     col = 2;
+    while (col < num_cols_y - 2) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 1];
+      curr_x[2] = X[(row - 0) * num_cols + col + 1];
+      curr_x[0] = X[(row - 2) * num_cols + col + 1];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 2];
+      curr_x[2] = X[(row - 0) * num_cols + col + 2];
+      curr_x[0] = X[(row - 2) * num_cols + col + 2];
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
     while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // No queue_push in this loop: like the populate step and the burst
+      // loop above, this section only loads from X and accumulates, so
+      // nothing may be forwarded to queue_next_x_*
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      // Increment column index
+      ++col;
+      if (col >= num_cols_y) break;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
       // Load x vector
-      curr_x[1] = X[(row - 1) * num_cols + col];
-      curr_x[2] = X[(row - 0) * num_cols + col];
-      curr_x[0] = X[(row - 2) * num_cols + col];
-      // MACs with 3th column of weights
-      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-      // MACs with 2nd column of weights
-      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-      // MACs with 1st column of weights
-      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
-      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
-      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      curr_x[1] = X[(row - 1) * num_cols + col];
+      curr_x[2] = X[(row - 0) * num_cols + col];
+      curr_x[0] = X[(row - 2) * num_cols + col];
+      // As in ITERATION 0, nothing is pushed to queue_next_x_*: the loop
+      // this code replaces only loaded and accumulated, and unrolling it
+      // must not change the number of elements pushed
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
-      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
       // Reset finished accumulation
-      acc_y[col % 3] = 0;
+      acc_y[0] = 0;
       // Increment column index
       ++col;
     }
@@ -280,18 +524,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     curr_x[1] = X[(row - 1) * num_cols + col];
     curr_x[2] = X[(row - 0) * num_cols + col];
     curr_x[0] = X[(row - 2) * num_cols + col];
-    // MACs with 3th column of weights
+    // MACs with 1st row of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // MACs with 2nd column of weights
     acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    // MACs with 2nd row of weights
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    // MACs with 3rd row of weights
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
     // Increment column index
     ++col;
     // -------
@@ -301,14 +544,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     curr_x[1] = X[(row - 1) * num_cols + col];
     curr_x[2] = X[(row - 0) * num_cols + col];
     curr_x[0] = X[(row - 2) * num_cols + col];
-    // MACs with 3th column of weights
+    // MACs with 3rd column of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
     acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
   }
 }
 
@@ -358,6 +599,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -368,19 +610,142 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 2nd column of weights
+    // MACs with 1st row of weights
     acc_y[2] += curr_x[0] * weights[0][1];
-    acc_y[2] += curr_x[1] * weights[1][1];
-    acc_y[2] += curr_x[2] * weights[2][1];
-    // MACs with 1st column of weights
     acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
     acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    // -----------
-    // CONVOLUTION
-    // -----------
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
     col = 2;
+    while (col < num_cols_y - 2) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 0];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 1];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 2];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
     while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      // Increment column index
+      ++col;
+      if (col >= num_cols_y) break;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
       // Pop and load x vector
       queue_pop(queue_prev_x_1, &curr_x[1]);
       curr_x[2] = X[row * num_cols + col];
@@ -388,22 +753,22 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       // Push lower part of x vector
       queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
       queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-      // MACs with 3th column of weights
-      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-      // MACs with 2nd column of weights
-      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-      // MACs with 1st column of weights
-      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
-      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
-      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
-      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
       // Reset finished accumulation
-      acc_y[col % 3] = 0;
+      acc_y[0] = 0;
       // Increment column index
       ++col;
     }
@@ -417,18 +782,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3th column of weights
+    // MACs with 1st row of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // MACs with 2nd column of weights
     acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    // MACs with 2nd row of weights
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    // MACs with 3rd row of weights
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
     // Increment column index
     ++col;
     // -------
@@ -441,14 +805,18 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // Push lower part of x vector
     queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
     queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3th column of weights
+    // MACs with 3rd column of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
     acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
     // -------------
     // INCREMENT ROW
     // -------------
@@ -475,39 +843,150 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + 1];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 2nd column of weights
+    // MACs with 1st row of weights
     acc_y[2] += curr_x[0] * weights[0][1];
-    acc_y[2] += curr_x[1] * weights[1][1];
-    acc_y[2] += curr_x[2] * weights[2][1];
-    // MACs with 1st column of weights
     acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
     acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    // -----------
-    // CONVOLUTION
-    // -----------
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
     col = 2;
+    while (col < num_cols_y - 2) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 0];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 1];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 2];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
     while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      // Increment column index
+      ++col;
+      if (col >= num_cols_y) break;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
       // Pop and load x vector
       queue_pop(queue_prev_x_1, &curr_x[1]);
       curr_x[2] = X[row * num_cols + col];
       queue_pop(queue_prev_x_0, &curr_x[0]);
-      // MACs with 3th column of weights
-      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-      // MACs with 2nd column of weights
-      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-      // MACs with 1st column of weights
-      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
-      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
-      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
-      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
       // Reset finished accumulation
-      acc_y[col % 3] = 0;
+      acc_y[0] = 0;
       // Increment column index
       ++col;
     }
@@ -518,18 +997,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + col];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3th column of weights
+    // MACs with 1st row of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // MACs with 2nd column of weights
     acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    // MACs with 2nd row of weights
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    // MACs with 3rd row of weights
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
     // Increment column index
     ++col;
     // -------
@@ -539,14 +1017,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + col];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3th column of weights
+    // MACs with 3rd column of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
     acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
   }
 }
 
@@ -585,6 +1061,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -592,39 +1069,150 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + 1];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 2nd column of weights
+    // MACs with 1st row of weights
     acc_y[2] += curr_x[0] * weights[0][1];
-    acc_y[2] += curr_x[1] * weights[1][1];
-    acc_y[2] += curr_x[2] * weights[2][1];
-    // MACs with 1st column of weights
     acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
     acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    // -----------
-    // CONVOLUTION
-    // -----------
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
     col = 2;
+    while (col < num_cols_y - 2) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 0];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 1];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col + 2];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
     while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Pop and load x vector
+      queue_pop(queue_prev_x_1, &curr_x[1]);
+      curr_x[2] = X[row * num_cols + col];
+      queue_pop(queue_prev_x_0, &curr_x[0]);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      // Increment column index
+      ++col;
+      if (col >= num_cols_y) break;
+      __asm__ __volatile__("":::"memory");
+      // -----------
+      // ITERATION 1
+      // -----------
       // Pop and load x vector
       queue_pop(queue_prev_x_1, &curr_x[1]);
       curr_x[2] = X[row * num_cols + col];
       queue_pop(queue_prev_x_0, &curr_x[0]);
-      // MACs with 3th column of weights
-      acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-      acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-      acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-      // MACs with 2nd column of weights
-      acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-      acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-      acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-      // MACs with 1st column of weights
-      acc_y[(col + 2) % 3] += curr_x[0] * weights[0][0];
-      acc_y[(col + 2) % 3] += curr_x[1] * weights[1][0];
-      acc_y[(col + 2) % 3] += curr_x[2] * weights[2][0];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
-      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
       // Reset finished accumulation
-      acc_y[col % 3] = 0;
+      acc_y[0] = 0;
       // Increment column index
       ++col;
     }
@@ -635,18 +1223,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + col];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3th column of weights
+    // MACs with 1st row of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // MACs with 2nd column of weights
     acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
+    // MACs with 2nd row of weights
+    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
+    // MACs with 3rd row of weights
+    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
     // Increment column index
     ++col;
     // -------
@@ -656,13 +1243,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     queue_pop(queue_prev_x_1, &curr_x[1]);
     curr_x[2] = X[row * num_cols + col];
     queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3th column of weights
+    // MACs with 3rd column of weights
     acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
     acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
     acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
     // Store finished accumulation
     Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Reset finished accumulation
-    acc_y[col % 3] = 0;
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
   }
 }

From 244b9a5794c273ea5ad4410e998b382587fb2ba4 Mon Sep 17 00:00:00 2001
From: Gua Hao Khov <khovg@student.ethz.ch>
Date: Fri, 16 Jul 2021 02:35:02 +0200
Subject: [PATCH 18/24] [apps] Improve regularity of conv_xqueue

---
 hardware/src/tcdm_adapter_xqueue.sv       |   6 +-
 software/apps/systolic/conv_xqueue/main.c |   4 +-
 software/runtime/systolic/conv_xqueue.h   | 264 +++-------------------
 3 files changed, 42 insertions(+), 232 deletions(-)

diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index 4adb3f415..407952141 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -10,9 +10,9 @@
 
 `include "common_cells/registers.svh"
 
-import cf_math_pkg::idx_width;
-
-module tcdm_adapter_xqueue #(
+module tcdm_adapter_xqueue
+  import cf_math_pkg::idx_width;
+#(
   parameter int unsigned AddrWidth    = 32,
   parameter int unsigned DataWidth    = 32,
   parameter int unsigned XQueueSize   = 4,
diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
index 6fd8045b5..c95b674d5 100644
--- a/software/apps/systolic/conv_xqueue/main.c
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -27,8 +27,8 @@
 #include "synchronization.h"
 
 // Dimensions of matrix X
-#define DIM_X_M 32
-#define DIM_X_N 32
+#define DIM_X_M 258
+#define DIM_X_N 61
 
 // Dimensions of matrix Y
 #define DIM_Y_M (DIM_X_M - 2)
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
index 096ec4427..bff238d0e 100644
--- a/software/runtime/systolic/conv_xqueue.h
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -39,12 +39,12 @@ int32_t *queues_x_1[NUM_CORES];
 // queue push
 static inline void queue_push(void *const queue, int32_t data,
                               int32_t *const ret) {
-  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue) : "memory");
 }
 
 // queue pop
 inline void queue_pop(void *const queue, int32_t *const ret) {
-  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue) : "memory");
 }
 
 void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) {
@@ -87,7 +87,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
   int32_t resp_x_1 __attribute__((unused)) = 0;
   int32_t weights[3][3];
   int32_t curr_x[3];
-  int32_t acc_y[3] = {0, 0, 0};
+  register int32_t acc_y[3] = {0, 0, 0};
   uint32_t row;
   uint32_t col;
   uint32_t num_cols_y = num_cols - 2;
@@ -140,11 +140,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
     col = 2;
-    while (col < num_cols_y - 2) {
+    while (col < num_cols_y) {
       // -----------
       // ITERATION 0
       // -----------
@@ -225,15 +226,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
+      __asm__ __volatile__("":::"memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
+    __asm__ __volatile__("":::"memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
-    while (col < num_cols_y) {
+    while (col < num_cols) {
       // -----------
       // ITERATION 0
       // -----------
@@ -258,11 +261,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
-      // Reset finished accumulation
-      acc_y[2] = 0;
       // Increment column index
       ++col;
-      if (col >= num_cols_y) break;
+      if (col >= num_cols) break;
       __asm__ __volatile__("":::"memory");
       // -----------
       // ITERATION 1
@@ -288,50 +289,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
-      // Reset finished accumulation
-      acc_y[0] = 0;
-      // Increment column index
-      ++col;
     }
-    // -------
-    // FLUSH 0
-    // -------
-    // Load x vector
-    curr_x[1] = X[(row - 1) * num_cols + col];
-    curr_x[2] = X[(row - 0) * num_cols + col];
-    curr_x[0] = X[(row - 2) * num_cols + col];
-    // Push lower part of x vector
-    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
-    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 1st row of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-    // MACs with 2nd row of weights
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-    // MACs with 3rd row of weights
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Increment column index
-    ++col;
-    // -------
-    // FLUSH 1
-    // -------
-    // Load x vector
-    curr_x[1] = X[(row - 1) * num_cols + col];
-    curr_x[2] = X[(row - 0) * num_cols + col];
-    curr_x[0] = X[(row - 2) * num_cols + col];
-    // Push lower part of x vector
-    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
-    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3rd column of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
     // ------------------
     // RESET ACCUMULATORS
     // ------------------
@@ -357,6 +315,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -373,11 +332,12 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
     col = 2;
-    while (col < num_cols_y - 2) {
+    while (col < num_cols_y) {
       // -----------
       // ITERATION 0
       // -----------
@@ -449,15 +409,17 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
+      __asm__ __volatile__("":::"memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
+    __asm__ __volatile__("":::"memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
-    while (col < num_cols_y) {
+    while (col < num_cols) {
       // -----------
       // ITERATION 0
       // -----------
@@ -482,11 +444,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
-      // Reset finished accumulation
-      acc_y[2] = 0;
       // Increment column index
       ++col;
-      if (col >= num_cols_y) break;
+      if (col >= num_cols) break;
       __asm__ __volatile__("":::"memory");
       // -----------
       // ITERATION 1
@@ -512,44 +472,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
-      // Reset finished accumulation
-      acc_y[0] = 0;
-      // Increment column index
-      ++col;
     }
-    // -------
-    // FLUSH 0
-    // -------
-    // Load x vector
-    curr_x[1] = X[(row - 1) * num_cols + col];
-    curr_x[2] = X[(row - 0) * num_cols + col];
-    curr_x[0] = X[(row - 2) * num_cols + col];
-    // MACs with 1st row of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-    // MACs with 2nd row of weights
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-    // MACs with 3rd row of weights
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Increment column index
-    ++col;
-    // -------
-    // FLUSH 1
-    // -------
-    // Load x vector
-    curr_x[1] = X[(row - 1) * num_cols + col];
-    curr_x[2] = X[(row - 0) * num_cols + col];
-    curr_x[0] = X[(row - 2) * num_cols + col];
-    // MACs with 3rd column of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
   }
 }
 
@@ -564,7 +487,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
   int32_t resp_x_1 __attribute__((unused)) = 0;
   int32_t weights[3][3];
   int32_t curr_x[3];
-  int32_t acc_y[3] = {0, 0, 0};
+  register int32_t acc_y[3] = {0, 0, 0};
   uint32_t row;
   uint32_t col;
   uint32_t num_cols_y = num_cols - 2;
@@ -619,11 +542,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
     col = 2;
-    while (col < num_cols_y - 2) {
+    while (col < num_cols_y) {
       // -----------
       // ITERATION 0
       // -----------
@@ -704,15 +628,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
+      __asm__ __volatile__("":::"memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
+    __asm__ __volatile__("":::"memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
-    while (col < num_cols_y) {
+    while (col < num_cols) {
       // -----------
       // ITERATION 0
       // -----------
@@ -737,11 +663,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
-      // Reset finished accumulation
-      acc_y[2] = 0;
       // Increment column index
       ++col;
-      if (col >= num_cols_y) break;
+      if (col >= num_cols) break;
       __asm__ __volatile__("":::"memory");
       // -----------
       // ITERATION 1
@@ -767,50 +691,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
-      // Reset finished accumulation
-      acc_y[0] = 0;
-      // Increment column index
-      ++col;
     }
-    // -------
-    // FLUSH 0
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // Push lower part of x vector
-    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
-    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 1st row of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-    // MACs with 2nd row of weights
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-    // MACs with 3rd row of weights
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Increment column index
-    ++col;
-    // -------
-    // FLUSH 1
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // Push lower part of x vector
-    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
-    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
-    // MACs with 3rd column of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
     // ------------------
     // RESET ACCUMULATORS
     // ------------------
@@ -836,6 +717,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -852,11 +734,12 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
     col = 2;
-    while (col < num_cols_y - 2) {
+    while (col < num_cols_y) {
       // -----------
       // ITERATION 0
       // -----------
@@ -928,15 +811,17 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
+      __asm__ __volatile__("":::"memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
+    __asm__ __volatile__("":::"memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
-    while (col < num_cols_y) {
+    while (col < num_cols) {
       // -----------
       // ITERATION 0
       // -----------
@@ -958,11 +843,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
-      // Reset finished accumulation
-      acc_y[2] = 0;
       // Increment column index
       ++col;
-      if (col >= num_cols_y) break;
+      if (col >= num_cols) break;
       __asm__ __volatile__("":::"memory");
       // -----------
       // ITERATION 1
@@ -985,44 +868,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
-      // Reset finished accumulation
-      acc_y[0] = 0;
-      // Increment column index
-      ++col;
     }
-    // -------
-    // FLUSH 0
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 1st row of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-    // MACs with 2nd row of weights
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-    // MACs with 3rd row of weights
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Increment column index
-    ++col;
-    // -------
-    // FLUSH 1
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3rd column of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
   }
 }
 
@@ -1033,7 +879,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
   int32_t *queue_prev_x_1;
   int32_t weights[3][3];
   int32_t curr_x[3];
-  int32_t acc_y[3] = {0, 0, 0};
+  register int32_t acc_y[3] = {0, 0, 0};
   uint32_t col;
   uint32_t num_cols_y = num_cols - 2;
 
@@ -1078,11 +924,12 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("":::"memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
     col = 2;
-    while (col < num_cols_y - 2) {
+    while (col < num_cols_y) {
       // -----------
       // ITERATION 0
       // -----------
@@ -1154,15 +1001,17 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
+      __asm__ __volatile__("":::"memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
+    __asm__ __volatile__("":::"memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
-    while (col < num_cols_y) {
+    while (col < num_cols) {
       // -----------
       // ITERATION 0
       // -----------
@@ -1184,11 +1033,9 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[1] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
-      // Reset finished accumulation
-      acc_y[2] = 0;
       // Increment column index
       ++col;
-      if (col >= num_cols_y) break;
+      if (col >= num_cols) break;
       __asm__ __volatile__("":::"memory");
       // -----------
       // ITERATION 1
@@ -1211,44 +1058,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       acc_y[2] += curr_x[2] * weights[2][0];
       // Store finished accumulation
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
-      // Reset finished accumulation
-      acc_y[0] = 0;
-      // Increment column index
-      ++col;
     }
-    // -------
-    // FLUSH 0
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 1st row of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 1) % 3] += curr_x[0] * weights[0][1];
-    // MACs with 2nd row of weights
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 1) % 3] += curr_x[1] * weights[1][1];
-    // MACs with 3rd row of weights
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    acc_y[(col + 1) % 3] += curr_x[2] * weights[2][1];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
-    // Increment column index
-    ++col;
-    // -------
-    // FLUSH 1
-    // -------
-    // Pop and load x vector
-    queue_pop(queue_prev_x_1, &curr_x[1]);
-    curr_x[2] = X[row * num_cols + col];
-    queue_pop(queue_prev_x_0, &curr_x[0]);
-    // MACs with 3rd column of weights
-    acc_y[(col + 0) % 3] += curr_x[0] * weights[0][2];
-    acc_y[(col + 0) % 3] += curr_x[1] * weights[1][2];
-    acc_y[(col + 0) % 3] += curr_x[2] * weights[2][2];
-    // Store finished accumulation
-    Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[col % 3];
     // ------------------
     // RESET ACCUMULATORS
     // ------------------

From 4517da9fe0094cb396c1a3a83b83fd03f17eca95 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Wed, 14 Sep 2022 12:45:20 +0200
Subject: [PATCH 19/24] [apps] Fix license and format

---
 software/apps/systolic/conv_xqueue/main.c   |  18 +---
 software/apps/systolic/matmul_xqueue/main.c |  18 +---
 software/apps/systolic/xqueue_test/main.c   |  20 +---
 software/runtime/systolic/conv_xqueue.h     | 106 ++++++++++----------
 software/runtime/systolic/matmul_xqueue.h   |  16 +--
 5 files changed, 63 insertions(+), 115 deletions(-)

diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
index c95b674d5..f4c4339b8 100644
--- a/software/apps/systolic/conv_xqueue/main.c
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -1,18 +1,6 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-//
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 // Author: Gua Hao Khov, ETH Zurich
 
@@ -21,10 +9,10 @@
 
 #include "alloc.h"
 #include "encoding.h"
-#include "systolic/conv_xqueue.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
+#include "systolic/conv_xqueue.h"
 
 // Dimensions of matrix X
 #define DIM_X_M 258
diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index f7a648ab3..dada500b4 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -1,18 +1,6 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-//
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 // Author: Gua Hao Khov, ETH Zurich
 
@@ -21,10 +9,10 @@
 
 #include "alloc.h"
 #include "encoding.h"
-#include "systolic/matmul_xqueue.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
+#include "systolic/matmul_xqueue.h"
 
 // Dimensions of matrices
 #define DIM_M 24
diff --git a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c
index 4cd39ca5c..ee4b7ee92 100644
--- a/software/apps/systolic/xqueue_test/main.c
+++ b/software/apps/systolic/xqueue_test/main.c
@@ -1,18 +1,6 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-//
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 // Author: Gua Hao Khov, ETH Zurich
 
@@ -32,14 +20,14 @@ int32_t producer_check, consumer_check, dummy_check;
 // queue push
 static inline int32_t queue_push(void *const queue, int32_t data) {
   int32_t ret;
-  asm volatile ("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue));
+  asm volatile("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue));
   return ret;
 }
 
 // queue pop
 inline int32_t queue_pop(void *const queue) {
   int32_t ret;
-  asm volatile ("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue));
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue));
   return ret;
 }
 
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
index bff238d0e..8e6e251de 100644
--- a/software/runtime/systolic/conv_xqueue.h
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -1,18 +1,6 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-//
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 // Author: Gua Hao Khov, ETH Zurich
 
@@ -39,7 +27,10 @@ int32_t *queues_x_1[NUM_CORES];
 // queue push
 static inline void queue_push(void *const queue, int32_t data,
                               int32_t *const ret) {
-  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue) : "memory");
+  asm volatile("q.push.w %0, %1, (%2)"
+               : "+r"(*ret)
+               : "r"(data), "r"(queue)
+               : "memory");
 }
 
 // queue pop
@@ -120,7 +111,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -140,7 +131,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
@@ -172,7 +163,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
       // Reset finished accumulation
       acc_y[2] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -199,7 +190,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
       // Reset finished accumulation
       acc_y[0] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 2
       // -----------
@@ -226,13 +217,13 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
@@ -263,8 +254,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Increment column index
       ++col;
-      if (col >= num_cols) break;
-      __asm__ __volatile__("":::"memory");
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -315,7 +307,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -332,7 +324,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
@@ -361,7 +353,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
       // Reset finished accumulation
       acc_y[2] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -385,7 +377,7 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
       // Reset finished accumulation
       acc_y[0] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 2
       // -----------
@@ -409,13 +401,13 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
@@ -446,8 +438,9 @@ void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Increment column index
       ++col;
-      if (col >= num_cols) break;
-      __asm__ __volatile__("":::"memory");
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -522,7 +515,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -542,7 +535,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
@@ -574,7 +567,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
       // Reset finished accumulation
       acc_y[2] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -601,7 +594,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
       // Reset finished accumulation
       acc_y[0] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 2
       // -----------
@@ -628,13 +621,13 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
@@ -665,8 +658,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Increment column index
       ++col;
-      if (col >= num_cols) break;
-      __asm__ __volatile__("":::"memory");
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -717,7 +711,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -734,7 +728,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
@@ -763,7 +757,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
       // Reset finished accumulation
       acc_y[2] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -787,7 +781,7 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
       // Reset finished accumulation
       acc_y[0] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 2
       // -----------
@@ -811,13 +805,13 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
@@ -845,8 +839,9 @@ void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Increment column index
       ++col;
-      if (col >= num_cols) break;
-      __asm__ __volatile__("":::"memory");
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -907,7 +902,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     acc_y[2] += curr_x[0] * weights[0][0];
     acc_y[2] += curr_x[1] * weights[1][0];
     acc_y[2] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ----------
     // POPULATE 1
     // ----------
@@ -924,7 +919,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
     // MACs with 3rd row of weights
     acc_y[2] += curr_x[2] * weights[2][1];
     acc_y[0] += curr_x[2] * weights[2][0];
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ------------------
     // CONVOLUTION BURSTS
     // ------------------
@@ -953,7 +948,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
       // Reset finished accumulation
       acc_y[2] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
@@ -977,7 +972,7 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
       // Reset finished accumulation
       acc_y[0] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 2
       // -----------
@@ -1001,13 +996,13 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
       // Reset finished accumulation
       acc_y[1] = 0;
-      __asm__ __volatile__("":::"memory");
+      __asm__ __volatile__("" ::: "memory");
       // ----------------
       // INCREMENT COLUMN
       // ----------------
       col += 3;
     }
-    __asm__ __volatile__("":::"memory");
+    __asm__ __volatile__("" ::: "memory");
     // ---------------------
     // CONVOLUTION REMAINDER
     // ---------------------
@@ -1035,8 +1030,9 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
       Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
       // Increment column index
       ++col;
-      if (col >= num_cols) break;
-      __asm__ __volatile__("":::"memory");
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
       // -----------
       // ITERATION 1
       // -----------
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index cb26e762b..dbfe51b8b 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -1,18 +1,6 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-//
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 // Author: Gua Hao Khov, ETH Zurich
 

From 4691691e04cfff047f87589b75f564cc4c85da7d Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Thu, 15 Sep 2022 15:02:31 +0200
Subject: [PATCH 20/24] [CHANGELOG] Add Xqueue extension and sw kernels

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 187934617..e934e6ca3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Added
 - Add a DMA
+- Add support to hardrware-accelerated queues for CGRA (RV32A extension)
+- Add systolic implementation of matmul and 2d convolution exploiting hardware-accelerated queues
 
 ### Fixed
 - Measure the `wfi` stalls and stalls caused by `opc` properly
@@ -34,7 +36,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add the `terapool` configuration
 - Add read-only caches to the hierarchical AXI interconnect
 - Add a `memcpy` benchmark
-- Add a systolic configuration including runtime support and a matmul application
+- Add a systolic configuration for software-emulated CGRA including runtime support and a systolic matmul
 - Add `axpy` kernel
 - Add Spyglass linting scripts
 - Add an OpenMP runtime and example applications

From 8e210c7b6c621ea01ae174493e57cb26fbbd64f2 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Fri, 16 Sep 2022 11:02:56 +0200
Subject: [PATCH 21/24] [config] Update systolic config

---
 config/README.md   | 1 +
 config/systolic.mk | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/README.md b/config/README.md
index 1aa187773..60641c979 100644
--- a/config/README.md
+++ b/config/README.md
@@ -10,6 +10,7 @@ flavors of MemPool. We currently support three flavors:
 - `terapool`: 1024 cores, organized into 128 tiles with eight cores each
 - `mempool`: 256 cores, organized into 64 tiles with four cores each (default)
 - `minpool`: 16 cores, organized into 4 tiles with four cores each
+- `systolic`: same as `mempool` but the cores form a CGRA
 
 Use the `config` variable to define which configuration to take. For example,
 to run a simulation with the `minpool` configuration, you would run
diff --git a/config/systolic.mk b/config/systolic.mk
index e14ce5a99..9d22978d3 100644
--- a/config/systolic.mk
+++ b/config/systolic.mk
@@ -19,7 +19,7 @@ num_cores_per_tile ?= 4
 banking_factor ?= 4
 
 # Radix for hierarchical AXI interconnect
-axi_hier_radix ?= 16
+axi_hier_radix ?= 20
 
 # Number of AXI masters per group
 axi_masters_per_group ?= 1

From 26f38776660f806981b78a6701b225350c542792 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Fri, 23 Sep 2022 17:59:15 +0200
Subject: [PATCH 22/24] [hardware] :bug: Add write response to Xqueue TCDM
 adapter

---
 hardware/src/tcdm_adapter_xqueue.sv | 37 +++++++++++++----------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
index 407952141..196ed2222 100644
--- a/hardware/src/tcdm_adapter_xqueue.sv
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -84,15 +84,15 @@ module tcdm_adapter_xqueue
   logic sresp_vld;
 
   // Helper signals to determine response data acquisition
-  logic mem_read_req;
-  logic force_rdata_acq;
-  logic prevent_rdata_acq;
+  logic mem_req;
+  logic prevent_resp_acq;
 
   // FSM related signals
   state_e state_q, state_d;
   logic   vld_amo_op;
   logic   req_accepted, resp_accepted;
   logic   queue_stalled_d, queue_stalled_q;
+  logic   amo_wb;
 
   // Temporary storage for AMO operations
   amo_op_t              amo_op_d, amo_op_q;
@@ -131,7 +131,7 @@ module tcdm_adapter_xqueue
     .ready_i(meta_out_rdy    ),
     .data_o (stored_meta_data)
   );
-  assign meta_in_vld  = req_accepted & !in_write_i & !stalled_queue_op;
+  assign meta_in_vld  = req_accepted & !stalled_queue_op;
   assign meta_out_rdy = sresp_select_q ? 1'b0 : resp_accepted;
 
   // Stores the metadata at handshake of stalled queue operations
@@ -169,10 +169,10 @@ module tcdm_adapter_xqueue
   assign resp_in_data  = out_rdata_i;
   assign rdata_out_rdy = resp_accepted;
 
-  // Set if memory read request occurs this cycle
-  assign mem_read_req = out_req_o & !out_write_o;
-  // Acquire response data a cycle after a memory read request (can be forced or prevented)
-  assign rdata_in_vld_d = force_rdata_acq | (mem_read_req & !prevent_rdata_acq);
+  // Set if a memory read/write request, other than an AMO write-back, occurs this cycle
+  assign mem_req = out_req_o && !amo_wb;
+  // Acquire response data a cycle after a memory read/write request (can be prevented)
+  assign rdata_in_vld_d = mem_req & !prevent_resp_acq;
 
   // Output response valid if both meta and read data are available (the read data will always be last)
   assign resp_vld   = meta_out_vld  & rdata_out_vld;
@@ -194,6 +194,7 @@ module tcdm_adapter_xqueue
     amo_op_d        = AMONone;
     addr_d          = addr_q;
     amo_operand_b_d = amo_operand_b_q;
+    amo_wb          = 1'b0;
     state_d         = state_q;
     sresp_select_d  = sresp_select_q;
     queue_stalled_d = queue_stalled_q;
@@ -211,9 +212,8 @@ module tcdm_adapter_xqueue
     // Response data as feed-through of read data
     // resp_in_data   = out_rdata_i;
 
-    // Flags to force or prevent response acquisition
-    force_rdata_acq   = 1'b0;
-    prevent_rdata_acq = 1'b0;
+    // Flag to prevent read/write response acquisition in case it does not actually happen
+    prevent_resp_acq = 1'b0;
 
     // Flags to increment queue counters
     increment_tail = 1'b0;
@@ -257,14 +257,11 @@ module tcdm_adapter_xqueue
               // Note: Memory write is still executed but the tail is not incremented
               // Set stalled flag
               queue_stalled_d   = 1'b1;
-              // Prevent acquisition of response data
-              prevent_rdata_acq = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
             end else begin
               // Set increment flag
               increment_tail  = 1'b1;
-              // Force acquisition of response data despite a write access
-              // Response data will match the write data of the write access
-              force_rdata_acq = 1'b1;
               // Previous queue pop failed due to empty queue
               if (queue_stalled_q) begin
                 queue_stalled_d = 1'b0;
@@ -278,8 +275,8 @@ module tcdm_adapter_xqueue
             if (queue_empty) begin
               // Set stalled flag
               queue_stalled_d   = 1'b1;
-              // Prevent acquisition of response data despite read access
-              prevent_rdata_acq = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
             end else begin
               // Set increment flag
               increment_head = 1'b1;
@@ -303,6 +300,7 @@ module tcdm_adapter_xqueue
         out_write_o = 1'b1;
         out_add_o   = addr_q;
         out_be_o    = 4'b1111;
+        amo_wb      = 1'b1;
         // serve from register if we cut the path
         if (RegisterAmo) begin
           out_wdata_o = amo_result_q;
@@ -327,9 +325,6 @@ module tcdm_adapter_xqueue
           increment_tail  = 1'b1;
           // Trigger memory access
           out_req_o       = 1'b1;
-          // Force acquisition of response data despite a write access
-          // Response data will match the write data of the write access
-          force_rdata_acq = 1'b1;
           // Set meta data selection to stalled meta data
           sresp_select_d  = 1'b1;
           // Return to Idle

From 2d8fa4edd61cacf0d1a030afa803c912bc0b1b63 Mon Sep 17 00:00:00 2001
From: Sergio Mazzola <smazzola@iis.ee.ethz.ch>
Date: Mon, 21 Nov 2022 15:20:22 +0100
Subject: [PATCH 23/24] [software] Generalize systolic matmul NUM_CORES

---
 software/apps/systolic/matmul_xqueue/main.c | 14 +++++++++-----
 software/runtime/systolic/matmul_xqueue.h   |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
index dada500b4..5c69fde7e 100644
--- a/software/apps/systolic/matmul_xqueue/main.c
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -70,6 +70,7 @@ int main() {
     core_mapping = (uint32_t *)simple_malloc(num_cores * 4);
   }
 
+#if NUM_CORES == 16
   // ----------
   // 16 CORES
   // ----------
@@ -89,7 +90,7 @@ int main() {
   // uint32_t row_idx = tile_id / 2;
   // row_idx *= 2;
   // row_idx += (core_id % 4) / 2;
-
+#elif NUM_CORES == 256
   // ----------
   // 256 CORES
   // ----------
@@ -99,8 +100,8 @@ int main() {
   // uint32_t row_idx = core_id / 16;
 
   // Assign grid position (col wise)
-  // uint32_t col_idx = core_id / 16;
-  // uint32_t row_idx = core_id % 16;
+  uint32_t col_idx = core_id / 16;
+  uint32_t row_idx = core_id % 16;
 
   // Assign grid position (square wise)
   // uint32_t col_idx = tile_id % 8;
@@ -122,6 +123,9 @@ int main() {
   // row_idx *= 2;
   // row_idx += (core_id % 4) / 2;
   // row_idx += add_row * 8;
+#else
+#error Unsupported NUM_CORES
+#endif
 
   // Wait for all cores
   mempool_barrier(num_cores);
@@ -138,10 +142,10 @@ int main() {
     printf("> Initialize\n");
 
     // Print out tile mapping
-    // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+    //print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
 
     // Print out core mapping
-    // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+    //print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
 
     // Initialize systolic array
     systolic_init(tile_mapping, core_mapping);
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
index dbfe51b8b..c1f8aac3b 100644
--- a/software/runtime/systolic/matmul_xqueue.h
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -28,7 +28,7 @@
 #include "printf.h"
 
 // Dimensions of square systolic array
-#define SYSTOLIC_SIZE 4
+#define SYSTOLIC_SIZE 16
 
 // Systolic matrix
 typedef struct {

From 0c2dffe5229cb7ca0ff904a0baa0cd0ecbce068d Mon Sep 17 00:00:00 2001
From: "msc22h14 Vaibhav Krishna (vakrishna)" <msc22h14@badile06.ee.ethz.ch>
Date: Fri, 2 Dec 2022 16:09:57 +0100
Subject: [PATCH 24/24] [tb] Add support to trace retired TCDM operations

---
 CHANGELOG.md              |   1 +
 CONTRIBUTORS.md           |   1 +
 hardware/Makefile         |   7 +-
 hardware/tb/mempool_tb.sv | 177 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e934e6ca3..f2ef14457 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add a DMA
 - Add support to hardrware-accelerated queues for CGRA (RV32A extension)
 - Add systolic implementation of matmul and 2d convolution exploiting hardware-accelerated queues
+- Add ability to trace the operations retired by the TCDM adapters
 
 ### Fixed
 - Measure the `wfi` stalls and stalls caused by `opc` properly
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a64295735..c7145f41e 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -11,4 +11,5 @@ Thanks to all.
 * Marc Gantenbein
 * Marco Bertuletti
 * Sergio Mazzola
+* Vaibhav Krishna
 * Yichao Zhang
diff --git a/hardware/Makefile b/hardware/Makefile
index 046d9ed04..5c1cd7ab9 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -47,6 +47,7 @@ verilator_top   ?= mempool_tb_verilator
 python          ?= python3
 # Enable tracing
 snitch_trace    ?= 0
+bank_trace		  ?= 0
 
 # Check if the specified QuestaSim version exists
 ifeq (, $(shell which $(questa_cmd)))
@@ -90,13 +91,17 @@ vlog_args += -work $(library)
 vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor)
 vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks)
 vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg)
-vlog_defs += -DSNITCH_TRACE=$(snitch_trace)
+vlog_defs += -DSNITCH_TRACE=$(snitch_trace) -DBANK_TRACE=$(bank_trace)
 vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width)
 vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width)
 vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group)
 vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group)
 vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size)
 
+ifeq ($(xqueue),1)
+	vlog_defs+= -DXQUEUE_TCDM_ADAPTER 
+endif
+
 # Traffic generation enabled
 ifdef tg
 	tg_ncycles ?= 10000
diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv
index c8dd12e9b..fedf5c651 100644
--- a/hardware/tb/mempool_tb.sv
+++ b/hardware/tb/mempool_tb.sv
@@ -194,6 +194,183 @@ module mempool_tb;
     end: gen_wfi_tiles
   end: gen_wfi_groups
 
+`endif
+`endif
+
+  /************************
+   *  Mempool Bank Trace  *
+   ************************/
+// Accessing signals hierarchically is not supported by Verilator
+`ifndef TARGET_SYNTHESIS
+`ifndef TARGET_VERILATOR
+  // Hierarchical path to a bank's TCDM adapter; the generate-block name depends on XQUEUE_TCDM_ADAPTER
+  `ifdef XQUEUE_TCDM_ADAPTER 
+    `define TCDM_ADAPTER(group,tile,bank) \
+    dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter_xqueue.i_tcdm_adapter 
+  `else
+    `define TCDM_ADAPTER(group,tile,bank) \
+    dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter.i_tcdm_adapter
+  `endif
+  int f; // File descriptor of the bank trace file, written by every generated bank tracer
+
+  initial begin
+    f = $fopen("trace_bank.dasm", "w");
+  end
+
+  localparam int BankTrace = `ifdef BANK_TRACE `BANK_TRACE `else 0 `endif; // 0 when BANK_TRACE is not defined; gates all trace printing
+
+  genvar i,j,k;
+  generate;
+    for (i=0; i<NumGroups; ++i) begin : gen_bank_trace_groups
+      for (j=0; j<NumTilesPerGroup; ++j) begin : gen_bank_trace_tiles
+        for (k=0; k<NumBanksPerTile; ++k) begin : gen_bank_trace_banks
+          int unsigned stall_d, stall_q, stall; // Stall-cycle counter (next/current) and the value printed as Qstall
+          group_id_t group_id, ini_group;
+          tile_group_id_t ini_tile;
+          tile_core_id_t ini_core; // ini_group/ini_tile/ini_core: initiator printed as "init=(...)" in the trace
+          logic increment_head_q, increment_tail_q, vld_amo_op_q; // Adapter status flags delayed by one cycle
+          logic [DataWidth-1:0] q_push_data_d, q_push_data_q; // Write data captured from requests with in_amo_i == 4'hC
+          string trace_entry; // Scratch string holding the line that is written to the trace file
+          typedef logic [$clog2(NumCoresPerTile + NumGroups)-1:0] local_req_interco_addr_t;
+          typedef struct packed { // Layout used to interpret the adapter's in_meta_o
+            local_req_interco_addr_t ini_addr;
+            meta_id_t meta_id;
+            tile_group_id_t tile_id;
+            tile_core_id_t core_id;
+            logic wide;
+          } metadata_t;
+          metadata_t metadata_sel; // in_meta_o of this bank's adapter, viewed as metadata_t
+          logic print_stall_d, print_stall_q, print_lw_d, print_lw_q, print_sw_d, print_sw_q; // Print requests, registered for one cycle
+          logic [31:0] in_addr_d, in_addr_q; // Address of the last accepted non-atomic access
+          logic [31:0] sw_d, sw_q; // Write data of the last accepted non-atomic store
+
+          // Next-state and print-request logic of the tracer for bank (i,j,k); blocking assignments only
+          always_comb begin
+            group_id      = i;
+            metadata_sel  = `TCDM_ADAPTER(i,j,k).in_meta_o;
+            stall_d       = stall_q;
+            q_push_data_d = q_push_data_q;
+            print_stall_d = 1'b0;
+            print_lw_d    = 1'b0;
+            print_sw_d    = 1'b0;
+            in_addr_d     = in_addr_q;
+            sw_d          = sw_q;
+            // Capture the write data of a Qpush request (in_amo_i == 4'hC)
+            if (`TCDM_ADAPTER(i,j,k).in_valid_i && `TCDM_ADAPTER(i,j,k).in_amo_i == 4'hC) begin
+              q_push_data_d = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+            end
+            // Calculate the initiating core from the response metadata
+            if (metadata_sel.ini_addr >= NumCoresPerTile) begin
+              ini_group = $bits(group_id_t)'(metadata_sel.ini_addr - NumCoresPerTile) ^ group_id;
+              ini_tile  = metadata_sel.tile_id;
+              ini_core  = metadata_sel.core_id;
+            end else begin
+              ini_group = group_id;
+              ini_tile  = j;
+              ini_core  = metadata_sel.ini_addr;
+            end
+            `ifdef XQUEUE_TCDM_ADAPTER
+              // Stall calculation for queue operations: restart on a head/tail increment, count while stalled
+              if (`TCDM_ADAPTER(i,j,k).increment_head || `TCDM_ADAPTER(i,j,k).increment_tail) begin
+                stall_d = 0;
+              end else if (`TCDM_ADAPTER(i,j,k).queue_stalled_q) begin
+                stall_d = stall_q + 1;
+              end
+              // Print the cycles of a stalled queue operation when it is resolved
+              // NOTE(review): stall is only written here, so it holds its value until always_ff prints it
+              if (`TCDM_ADAPTER(i,j,k).queue_stalled_q && !(`TCDM_ADAPTER(i,j,k).queue_stalled_d)) begin
+                print_stall_d = 1'b1;
+                stall = stall_q;
+              end
+            `endif
+            // Request a print for accepted non-atomic loads and stores (in_amo_i == 0)
+            if ((`TCDM_ADAPTER(i,j,k).in_amo_i == '0) && `TCDM_ADAPTER(i,j,k).in_valid_i && `TCDM_ADAPTER(i,j,k).in_ready_o) begin
+              in_addr_d = `TCDM_ADAPTER(i,j,k).in_address_i;
+              if (`TCDM_ADAPTER(i,j,k).in_write_i) begin
+                print_sw_d  = 1'b1;
+                sw_d        = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+              end else begin
+                print_lw_d  = 1'b1;
+              end
+            end
+          end
+
+          always_ff @(posedge clk or negedge rst_n) begin // Registers the tracer state and writes the trace entries
+            if (!rst_n) begin
+              stall_q           <= 0;
+              increment_head_q  <= '0;
+              increment_tail_q  <= '0;
+              vld_amo_op_q      <= '0;
+              q_push_data_q     <= '0;
+              print_stall_q     <= '0;
+              print_lw_q        <= '0;
+              print_sw_q        <= '0;
+              in_addr_q         <= '0;
+              sw_q              <= '0;
+            end else begin
+              stall_q           <= stall_d;
+              `ifdef XQUEUE_TCDM_ADAPTER
+                increment_head_q  <= `TCDM_ADAPTER(i,j,k).increment_head;
+                increment_tail_q  <= `TCDM_ADAPTER(i,j,k).increment_tail;
+                vld_amo_op_q      <= `TCDM_ADAPTER(i,j,k).vld_amo_op && `TCDM_ADAPTER(i,j,k).req_accepted;
+              `else // These adapter signals are only sampled when XQUEUE_TCDM_ADAPTER is defined; otherwise the flags stay zero
+                increment_head_q  <= '0;
+                increment_tail_q  <= '0;
+                vld_amo_op_q      <= '0;
+              `endif
+              q_push_data_q     <= q_push_data_d;
+              print_stall_q     <= print_stall_d;
+              print_lw_q        <= print_lw_d;
+              print_sw_q        <= print_sw_d;
+              in_addr_q         <= in_addr_d;
+              sw_q              <= sw_d;
+              // Print when a bank operation is retired (adapter asserts in_valid_o) and BankTrace is non-zero
+              if (BankTrace && `TCDM_ADAPTER(i,j,k).in_valid_o)begin
+                `ifdef XQUEUE_TCDM_ADAPTER
+                  // AMO excluding Qpush and Qpop
+                  if(vld_amo_op_q)begin
+                    trace_entry = $sformatf("%t: (%1d,%2d,%2d): %s, init=(%1d,%2d,%2d), address= 0x%h, data= %d\n",$time,i,j,k,`TCDM_ADAPTER(i,j,k).amo_op_q, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).addr_q,`TCDM_ADAPTER(i,j,k).amo_result);
+                    $fwrite(f, trace_entry);
+                  end
+                  // Queue operations: a registered head increment is traced as Qpop, a tail increment as Qpush
+                  if(increment_head_q || increment_tail_q) begin
+                    if (increment_head_q) begin
+                      trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpop ,",$time,i,j,k); 
+                      trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                    end else if (increment_tail_q)begin
+                      trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpush,",$time,i,j,k); 
+                      trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, q_push_data_q);
+                    end
+                    if(print_stall_q) begin 
+                      trace_entry = $sformatf("%s: Qstall=%d\n", trace_entry, stall);
+                    end else begin
+                      trace_entry = $sformatf("%s\n",trace_entry);
+                    end
+                    $fwrite(f, trace_entry);
+                  end
+                `endif
+                // Load: print_lw_q marks a load accepted in the previous cycle; data is taken from in_rdata_o
+                if (print_lw_q) begin
+                  trace_entry =  $sformatf("%t: (%1d,%2d,%2d): Load Word , init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                  $fwrite(f, trace_entry);
+                end
+                // Store: print_sw_q marks a store accepted in the previous cycle; data is the registered write data
+                if (print_sw_q) begin
+                  trace_entry =  $sformatf("%t: (%1d,%2d,%2d): Store Word, init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, sw_q);
+                  $fwrite(f, trace_entry);
+                end 
+              end
+            end
+          end 
+        end
+      end
+    end
+  endgenerate
+  
+  final begin // Close the bank trace file when the simulation finishes
+    $fclose(f);
+  end
+
 `endif
 `endif