diff --git a/Bender.yml b/Bender.yml
index 08b62d28c..982aa0c6b 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -28,6 +28,7 @@ sources:
     - hardware/src/mempool_cc.sv
     - hardware/src/snitch_addr_demux.sv
     - hardware/src/tcdm_adapter.sv
+    - hardware/src/tcdm_adapter_xqueue.sv
     - hardware/src/tcdm_shim.sv
     - hardware/src/tcdm_wide_narrow_mux.sv
     - hardware/src/address_scrambler.sv
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 187934617..f2ef14457 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Add a DMA
+- Add support for hardware-accelerated queues for CGRA (RV32A extension)
+- Add systolic implementations of matmul and 2D convolution exploiting the hardware-accelerated queues
+- Add the ability to trace the operations retired by the TCDM adapters
 ### Fixed
 - Measure the `wfi` stalls and stalls caused by `opc` properly
@@ -34,7 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add the `terapool` configuration
 - Add read-only caches to the hierarchical AXI interconnect
 - Add a `memcpy` benchmark
-- Add a systolic configuration including runtime support and a matmul application
+- Add a systolic configuration for a software-emulated CGRA, including runtime support and a systolic matmul
 - Add `axpy` kernel
 - Add Spyglass linting scripts
 - Add an OpenMP runtime and example applications
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a64295735..c7145f41e 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -11,4 +11,5 @@ Thanks to all.
 * Marc Gantenbein
 * Marco Bertuletti
 * Sergio Mazzola
+* Vaibhav Krishna
 * Yichao Zhang
diff --git a/config/README.md b/config/README.md
index 1aa187773..60641c979 100644
--- a/config/README.md
+++ b/config/README.md
@@ -10,7 +10,8 @@ flavors of MemPool.
-We currently support three flavors:
+We currently support four flavors:
 - `terapool`: 1024 cores, organized into 128 tiles with eight cores each
 - `mempool`: 256 cores, organized into 64 tiles with four cores each (default)
 - `minpool`: 16 cores, organized into 4 tiles with four cores each
+- `systolic`: same as `mempool`, but the cores form a CGRA
 Use the `config` variable to define which configuration to take.
For example, to run a simulation with the `minpool` configuration, you would run
diff --git a/config/config.mk b/config/config.mk
index ea0ff5425..fb01a9006 100644
--- a/config/config.mk
+++ b/config/config.mk
@@ -56,6 +56,9 @@ dmas_per_group ?= 4
 ## Xqueues configuration ##
 #############################
+# Hardware queues for systolic (atomic ISA extension in the TCDM adapter)
+xqueue ?= 0
+
 # XQueue extension's queue size in each memory bank (in words)
 xqueue_size ?= 0
diff --git a/config/mempool.mk b/config/mempool.mk
index a3df45b35..ec2c34154 100644
--- a/config/mempool.mk
+++ b/config/mempool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 20
diff --git a/config/minpool.mk b/config/minpool.mk
index 455cd30e6..484bef548 100644
--- a/config/minpool.mk
+++ b/config/minpool.mk
@@ -17,6 +17,9 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Number of DMA backends in each group
 dmas_per_group ?= 1
diff --git a/config/systolic.mk b/config/systolic.mk
index 5de36e4c5..9d22978d3 100644
--- a/config/systolic.mk
+++ b/config/systolic.mk
@@ -15,8 +15,11 @@ num_groups ?= 4
 # Number of cores per MemPool tile
 num_cores_per_tile ?= 4
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
-axi_hier_radix ?= 16
+axi_hier_radix ?= 20
 # Number of AXI masters per group
 axi_masters_per_group ?= 1
@@ -29,6 +32,10 @@ seq_mem_size ?= 2048
 ## Xqueues configuration ##
 #############################
-# Xqueue extension's queue size (in queue entries)
-# in each memory bank (assume banking factor of 4)
+# Hardware queues for systolic (atomic ISA extension in the TCDM adapter)
+xqueue ?= 1
+
+# Systolic queue sizes (assuming a banking factor of 4) for:
+# - software queue emulation (size measured in queue entries)
+# - the hardware Xqueue's queue in each memory bank (size measured in words)
 xqueue_size ?= 4
diff --git a/config/terapool.mk b/config/terapool.mk
index 5d3f90854..a9df13cba 100644
--- a/config/terapool.mk
+++ b/config/terapool.mk
@@ -17,6 +17,9 @@ num_groups ?= 8
 # Number of cores per Terapool tile
 num_cores_per_tile ?= 8
+# L1 scratchpad banking factor
+banking_factor ?= 4
+
 # Radix for hierarchical AXI interconnect
 axi_hier_radix ?= 8
diff --git a/hardware/Makefile b/hardware/Makefile
index 7965053d4..5c1cd7ab9 100644
--- a/hardware/Makefile
+++ b/hardware/Makefile
@@ -47,6 +47,7 @@ verilator_top ?= mempool_tb_verilator
 python ?= python3
 # Enable tracing
 snitch_trace ?= 0
+bank_trace ?= 0
 # Check if the specified QuestaSim version exists
 ifeq (, $(shell which $(questa_cmd)))
@@ -87,15 +88,19 @@ endif
 vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233
 vlog_args += -work $(library)
 # Defines
-vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups)
+vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor)
 vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks)
 vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg)
-vlog_defs += -DSNITCH_TRACE=$(snitch_trace)
+vlog_defs += -DSNITCH_TRACE=$(snitch_trace) -DBANK_TRACE=$(bank_trace)
 vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width)
 vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width)
 vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group)
 vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group)
-vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size)
+vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size)
+
+ifeq ($(xqueue),1)
+  vlog_defs += -DXQUEUE_TCDM_ADAPTER
+endif
 # Traffic generation enabled
 ifdef tg
diff --git a/hardware/deps/snitch/src/riscv_instr.sv b/hardware/deps/snitch/src/riscv_instr.sv
index 23107aa70..afbd2cd7c 100644
--- a/hardware/deps/snitch/src/riscv_instr.sv
+++ b/hardware/deps/snitch/src/riscv_instr.sv
@@ -935,6 +935,10 @@ package riscv_instr;
   localparam logic [31:0] PV_PACK_H = 32'b1101001??????????000?????1010111;
   localparam logic [31:0] PV_PACKHI_B = 32'b1101100??????????001?????1010111;
   localparam logic [31:0] PV_PACKLO_B = 32'b1110000??????????001?????1010111;
+  // AMO-format encodings (opcode 0101111, funct3 010): rs1 holds the queue
+  // address, rs2 the data to push (hardwired to zero for Q_POP), rd the response
+  localparam logic [31:0] Q_PUSH = 32'b00111????????????010?????0101111;
+  localparam logic [31:0] Q_POP = 32'b00110??00000?????010?????0101111;
   /* CSR Addresses */
   localparam logic [11:0] CSR_FFLAGS = 12'h1;
   localparam logic [11:0] CSR_FRM = 12'h2;
diff --git a/hardware/deps/snitch/src/snitch.sv b/hardware/deps/snitch/src/snitch.sv
index e4d48bb18..fd2927834 100644
--- a/hardware/deps/snitch/src/snitch.sv
+++ b/hardware/deps/snitch/src/snitch.sv
@@ -18,7 +18,8 @@ module snitch
   parameter logic [31:0] MTVEC = BootAddr, // Exception Base Address (see privileged spec 3.1.7)
   parameter bit RVE = 0, // Reduced-register Extension
   parameter bit RVM = 1, // Enable Integer Multiplication & Division Extension
-  parameter int RegNrWritePorts = 2 // Implement one or two write ports into the register file
+  parameter int RegNrWritePorts = 2, // Implement one or two write ports into the register file
+  parameter bit Xqueue = 0
 ) (
   input logic clk_i,
   input logic rst_i,
@@ -152,7 +153,10 @@ module snitch
     AMOMin = 4'h8,
     AMOMinu = 4'h9,
     AMOLR = 4'hA,
-    AMOSC = 4'hB
+    AMOSC = 4'hB,
+    // TODO(smazzola): parametrize
+    QPush = 4'hC, // Only used when Xqueue is enabled
+    QPop = 4'hD // Only used when Xqueue is enabled
   } ls_amo;
   logic [31:0] ld_result;
@@ -1324,6 +1328,41 @@ module snitch
       end
       /* end of Xpulpimg extension */
+      /* Xqueues extension */
+      // TODO(khovg): Add define to include instr
+      riscv_instr::Q_PUSH: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPush;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      // TODO(khovg): Two source registers are unnecessary
+      riscv_instr::Q_POP: begin
+        if (Xqueue) begin
+          alu_op = BypassA;
+          write_rd = 1'b0;
+          uses_rd = 1'b1;
+          is_load = 1'b1;
+          is_signed = 1'b1;
+          ls_size = Word;
+          ls_amo = QPop;
+          opa_select = Reg;
+          opb_select = Reg;
+        end else begin
+          illegal_inst = 1'b1;
+        end
+      end
+      /* end of Xqueues extension */
+
       // TODO(zarubaf): Illegal Instructions
       default: begin
         illegal_inst = 1'b1;
diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv
index 096156608..3c86b19d4 100644
--- a/hardware/src/mempool_cc.sv
+++ b/hardware/src/mempool_cc.sv
@@ -57,10 +57,11 @@ module mempool_cc
   // Snitch Integer Core
   snitch #(
-    .BootAddr ( BootAddr ),
-    .MTVEC ( MTVEC ),
-    .RVE ( RVE ),
-    .RVM ( RVM )
+    .BootAddr ( BootAddr ),
+    .MTVEC ( MTVEC ),
+    .RVE ( RVE ),
+    .RVM ( RVM ),
+    .Xqueue ( mempool_pkg::Xqueue )
   ) i_snitch (
     .clk_i ,
     .rst_i ,
diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv
index a11eeeff1..ce7915ee3 100644
--- a/hardware/src/mempool_pkg.sv
+++ b/hardware/src/mempool_pkg.sv
@@ -35,7 +35,7 @@ package mempool_pkg;
   localparam integer unsigned DataWidth = 32;
   localparam integer unsigned BeWidth = DataWidth / 8;
   localparam integer unsigned ByteOffset = $clog2(BeWidth);
-  localparam integer unsigned BankingFactor = 4;
+  localparam integer unsigned BankingFactor = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif;
   localparam bit LrScEnable = 1'b1;
   localparam integer unsigned TCDMSizePerBank = 1024; // [B]
   localparam integer unsigned NumBanks = NumCores * BankingFactor;
@@ -258,6 +258,9 @@ package mempool_pkg;
    * QUEUE PARAMETERS *
    **********************/
+  // Enable the Xqueue extension (hardware queues in the TCDM adapters)
+  localparam bit Xqueue = `ifdef XQUEUE `XQUEUE `else 1'b0 `endif;
+
   // Size of xqueues in words (must be a power of two)
   localparam int unsigned XQueueSize = `ifdef XQUEUE_SIZE `XQUEUE_SIZE `else 0 `endif;
diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv
index a3a6aa50b..19dacacae 100644
--- a/hardware/src/mempool_tile.sv
+++ b/hardware/src/mempool_tile.sv
@@ -381,34 +381,65 @@ module mempool_tile
     assign bank_resp_payload[b].rdata.amo = '0; // Don't care
     assign bank_resp_wide[b] = meta_out.wide;
-    tcdm_adapter #(
-      .AddrWidth (TCDMAddrMemWidth),
-      .DataWidth (DataWidth ),
-      .metadata_t (bank_metadata_t ),
-      .LrScEnable (LrScEnable ),
-      .RegisterAmo(1'b0 )
-    ) i_tcdm_adapter (
-      .clk_i (clk_i ),
-      .rst_ni (rst_ni ),
-      .in_valid_i (bank_req_valid[b] ),
-      .in_ready_o (bank_req_ready[b] ),
-      .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
-      .in_amo_i (bank_req_payload[b].wdata.amo ),
-      .in_write_i (bank_req_payload[b].wen ),
-      .in_wdata_i (bank_req_payload[b].wdata.data ),
-      .in_meta_i (meta_in ),
-      .in_be_i (bank_req_payload[b].be ),
-      .in_valid_o (bank_resp_valid[b] ),
-      .in_ready_i (bank_resp_ready[b] ),
-      .in_rdata_o (bank_resp_payload[b].rdata.data ),
-      .in_meta_o (meta_out ),
-      .out_req_o (req_valid ),
-      .out_add_o (req_addr ),
-      .out_write_o (req_write ),
-      .out_wdata_o (req_wdata ),
-      .out_be_o (req_be ),
-      .out_rdata_i (resp_rdata )
-    );
+    if (Xqueue) begin: gen_tcdm_adapter_xqueue
+      tcdm_adapter_xqueue #(
+        .AddrWidth (TCDMAddrMemWidth),
+        .DataWidth (DataWidth ),
+        .XQueueSize (XQueueSize ),
+        .metadata_t (bank_metadata_t ),
+        .RegisterAmo(1'b0 )
+      ) i_tcdm_adapter (
+        .clk_i (clk_i ),
+        .rst_ni (rst_ni ),
+        .in_valid_i (bank_req_valid[b] ),
+        .in_ready_o (bank_req_ready[b] ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i (bank_req_payload[b].wdata.amo ),
+        .in_write_i (bank_req_payload[b].wen ),
+        .in_wdata_i (bank_req_payload[b].wdata.data ),
+        .in_meta_i (meta_in ),
+        .in_be_i (bank_req_payload[b].be ),
+        .in_valid_o (bank_resp_valid[b] ),
+        .in_ready_i (bank_resp_ready[b] ),
+        .in_rdata_o (bank_resp_payload[b].rdata.data ),
+        .in_meta_o (meta_out ),
+        .out_req_o (req_valid ),
+        .out_add_o (req_addr ),
+        .out_write_o (req_write ),
+        .out_wdata_o (req_wdata ),
+        .out_be_o (req_be ),
+        .out_rdata_i (resp_rdata )
+      );
+    end else begin: gen_tcdm_adapter
+      tcdm_adapter #(
+        .AddrWidth (TCDMAddrMemWidth),
+        .DataWidth (DataWidth ),
+        .metadata_t (bank_metadata_t ),
+        .LrScEnable (LrScEnable ),
+        .RegisterAmo(1'b0 )
+      ) i_tcdm_adapter (
+        .clk_i (clk_i ),
+        .rst_ni (rst_ni ),
+        .in_valid_i (bank_req_valid[b] ),
+        .in_ready_o (bank_req_ready[b] ),
+        .in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
+        .in_amo_i (bank_req_payload[b].wdata.amo ),
+        .in_write_i (bank_req_payload[b].wen ),
+        .in_wdata_i (bank_req_payload[b].wdata.data ),
+        .in_meta_i (meta_in ),
+        .in_be_i (bank_req_payload[b].be ),
+        .in_valid_o (bank_resp_valid[b] ),
+        .in_ready_i (bank_resp_ready[b] ),
+        .in_rdata_o (bank_resp_payload[b].rdata.data ),
+        .in_meta_o (meta_out ),
+        .out_req_o (req_valid ),
+        .out_add_o (req_addr ),
+        .out_write_o (req_write ),
+        .out_wdata_o (req_wdata ),
+        .out_be_o (req_be ),
+        .out_rdata_i (resp_rdata )
+      );
+    end

     // Bank
     tc_sram #(
diff --git a/hardware/src/tcdm_adapter_xqueue.sv b/hardware/src/tcdm_adapter_xqueue.sv
new file mode 100644
index 000000000..196ed2222
--- /dev/null
+++ b/hardware/src/tcdm_adapter_xqueue.sv
@@ -0,0 +1,508 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Description: Handles the protocol conversion from valid/ready to req/gnt and correctly returns
+// the metadata. Additionally, it handles atomics and the hardware queue (Xqueue) operations.
+// Hence, it needs to be instantiated in front of an SRAM over which it has exclusive access.
+//
+// Author: Samuel Riedel
+
+`include "common_cells/registers.svh"
+
+module tcdm_adapter_xqueue
+  import cf_math_pkg::idx_width;
+#(
+  parameter int unsigned AddrWidth = 32,
+  parameter int unsigned DataWidth = 32,
+  parameter int unsigned XQueueSize = 4,
+  parameter type metadata_t = logic,
+  parameter bit RegisterAmo = 1'b0, // Cut path between request and response at the cost of increased AMO latency
+  // Dependent parameters. DO NOT CHANGE.
+  localparam int unsigned BeWidth = DataWidth/8,
+  localparam int unsigned QCntWidth = idx_width(XQueueSize)
+) (
+  input logic clk_i,
+  input logic rst_ni,
+  // master side
+  input logic in_valid_i, // Bank request
+  output logic in_ready_o, // Bank grant
+  input logic [AddrWidth-1:0] in_address_i, // Address
+  input logic [3:0] in_amo_i, // Atomic Memory Operation
+  input logic in_write_i, // 1: Store, 0: Load
+  input logic [DataWidth-1:0] in_wdata_i, // Write data
+  input metadata_t in_meta_i, // Meta data
+  input logic [BeWidth-1:0] in_be_i, // Byte enable
+  output logic in_valid_o, // Response valid
+  input logic in_ready_i, // Response ready
+  output logic [DataWidth-1:0] in_rdata_o, // Read data
+  output metadata_t in_meta_o, // Meta data
+  // slave side
+  output logic out_req_o, // Bank request
+  output logic [AddrWidth-1:0] out_add_o, // Address
+  output logic out_write_o, // 1: Store, 0: Load
+  output logic [DataWidth-1:0] out_wdata_o, // Write data
+  output logic [BeWidth-1:0] out_be_o, // Byte enable
+  input logic [DataWidth-1:0] out_rdata_i // Read data
+);
+
+  typedef enum logic [3:0] {
+    AMONone = 4'h0,
+    AMOSwap = 4'h1,
+    AMOAdd = 4'h2,
+    AMOAnd = 4'h3,
+    AMOOr = 4'h4,
+    AMOXor = 4'h5,
+    AMOMax = 4'h6,
+    AMOMaxu = 4'h7,
+    AMOMin = 4'h8,
+    AMOMinu = 4'h9,
+    AMOLR = 4'hA,
+    AMOSC = 4'hB,
+    QPush = 4'hC,
+    QPop = 4'hD
+  } amo_op_t;
+
+  typedef enum logic [2:0] {
+    Idle, DoAMO, WriteBackAMO, ResolveQPushStall, ResolveQPopStall
+  } state_e;
+
+  // Stored data in spill registers and fall through register
+  metadata_t stored_meta_data;
+  metadata_t stored_smeta_data;
+  logic [DataWidth-1:0] resp_in_data;
+
+  // Handshake signals for spill registers and fall through register
+  logic meta_in_vld, meta_in_rdy, meta_out_vld, meta_out_rdy;
+  logic smeta_in_vld, smeta_in_rdy, smeta_out_vld, smeta_out_rdy;
+  logic rdata_in_vld_d, rdata_in_vld_q;
+  logic rdata_in_rdy, rdata_out_vld, rdata_out_rdy;
+
+  // Response meta data selection and valid signals
+  logic sresp_select_d, sresp_select_q;
+  logic resp_vld;
+  logic sresp_vld;
+
+  // Helper signals to determine response data acquisition
+  logic mem_req;
+  logic prevent_resp_acq;
+
+  // FSM related signals
+  state_e state_q, state_d;
+  logic vld_amo_op;
+  logic req_accepted, resp_accepted;
+  logic queue_stalled_d, queue_stalled_q;
+  logic amo_wb;
+
+  // Temporary storage for AMO operations
+  amo_op_t amo_op_d, amo_op_q;
+  logic [AddrWidth-1:0] addr_d, addr_q;
+
+  // AMO ALU signals
+  logic [31:0] amo_operand_a;
+  logic [31:0] amo_operand_b_d, amo_operand_b_q;
+  logic [31:0] amo_result, amo_result_q;
+
+  // Queue counters
+  logic unsigned [QCntWidth-1:0] curr_tail_d, curr_tail_q;
+  logic unsigned [QCntWidth-1:0] next_tail_d, next_tail_q;
+  logic unsigned [QCntWidth-1:0] curr_head_d, curr_head_q;
+
+  // Queue counter increment
+  logic unsigned [QCntWidth-1:0] increment_operand, increment_result;
+
+  // Queue management signals
+  logic queue_empty;
+  logic queue_full;
+  logic increment_tail, increment_head;
+  logic stalled_queue_op;
+
+  // Stores the metadata at handshake (except stalled queue operations)
+  spill_register #(
+    .T (metadata_t),
+    .Bypass(1'b0 )
+  ) i_meta_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .valid_i(meta_in_vld ),
+    .ready_o(meta_in_rdy ),
+    .data_i (in_meta_i ),
+    .valid_o(meta_out_vld ),
+    .ready_i(meta_out_rdy ),
+    .data_o (stored_meta_data)
+  );
+  assign meta_in_vld = req_accepted & !stalled_queue_op;
+  assign meta_out_rdy = sresp_select_q ? 1'b0 : resp_accepted;
+
+  // Stores the metadata at handshake of stalled queue operations
+  spill_register #(
+    .T (metadata_t),
+    .Bypass(1'b0 )
+  ) i_stallmeta_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .valid_i(smeta_in_vld ),
+    .ready_o(smeta_in_rdy ),
+    .data_i (in_meta_i ),
+    .valid_o(smeta_out_vld ),
+    .ready_i(smeta_out_rdy ),
+    .data_o (stored_smeta_data)
+  );
+  assign smeta_in_vld = req_accepted & stalled_queue_op;
+  assign smeta_out_rdy = sresp_select_q ? resp_accepted : 1'b0;
+
+  // Store response data if it's not accepted immediately
+  fall_through_register #(
+    .T(logic [DataWidth-1:0])
+  ) i_rdata_register (
+    .clk_i (clk_i ),
+    .rst_ni (rst_ni ),
+    .clr_i (1'b0 ),
+    .testmode_i(1'b0 ),
+    .data_i (resp_in_data ),
+    .valid_i (rdata_in_vld_q),
+    .ready_o (rdata_in_rdy ),
+    .data_o (in_rdata_o ),
+    .valid_o (rdata_out_vld ),
+    .ready_i (rdata_out_rdy )
+  );
+  assign resp_in_data = out_rdata_i;
+  assign rdata_out_rdy = resp_accepted;
+
+  // Set if memory read/write request occurs this cycle
+  assign mem_req = out_req_o && !amo_wb;
+  // Acquire response data a cycle after a memory read/write request (can be forced or prevented)
+  assign rdata_in_vld_d = mem_req & !prevent_resp_acq;
+
+  // Output response valid if both meta and read data are available (the read data will always be last)
+  assign resp_vld = meta_out_vld & rdata_out_vld;
+  assign sresp_vld = smeta_out_vld & rdata_out_vld;
+  // Select output valid depending on response selection
+  assign in_valid_o = sresp_select_q ? sresp_vld : resp_vld;
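+  // Illustrative walk-through (inferred from the FSM below, not additional
+  // functionality): a QPush to a full queue or a QPop from an empty queue is
+  // granted but cannot respond yet. Its metadata is parked in
+  // i_stallmeta_register and queue_stalled_q is set. Once the opposite
+  // operation frees or fills a slot, the FSM replays the parked operation
+  // (ResolveQPushStall or ResolveQPopStall) and sresp_select_q steers the
+  // parked metadata onto the response port instead of i_meta_register.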
+  // Select output meta data depending on response selection
+  assign in_meta_o = sresp_select_q ? stored_smeta_data : stored_meta_data;
+
+  // Queue operations are not treated as valid AMO operations
+  assign vld_amo_op = !(amo_op_t'(in_amo_i) inside {AMONone, QPush, QPop});
+  // Request is accepted on successful input handshake
+  assign req_accepted = in_valid_i & in_ready_o;
+  // Response is accepted on successful output handshake
+  assign resp_accepted = in_ready_i & in_valid_o;
+
+  always_comb begin
+    // Default
+    amo_op_d = AMONone;
+    addr_d = addr_q;
+    amo_operand_b_d = amo_operand_b_q;
+    amo_wb = 1'b0;
+    state_d = state_q;
+    sresp_select_d = sresp_select_q;
+    queue_stalled_d = queue_stalled_q;
+
+    // While a response is pending, no requests are accepted
+    in_ready_o = in_valid_o & ~in_ready_i ? 1'b0 : 1'b1;
+
+    // Feed-through of request
+    out_req_o = req_accepted;
+    out_add_o = in_address_i;
+    out_write_o = in_write_i;
+    out_wdata_o = in_wdata_i;
+    out_be_o = in_be_i;
+
+    // Response data as feed-through of read data
+    // resp_in_data = out_rdata_i;
+
+    // Flag to prevent read/write response acquisition in case it does not actually happen
+    prevent_resp_acq = 1'b0;
+
+    // Flags to increment queue counters
+    increment_tail = 1'b0;
+    increment_head = 1'b0;
+
+    // FSM
+    unique case (state_q)
+      // Idle state handles normal loads/stores, non-stalled queue operations,
+      // and the initial read of AMO operations (single-cycle operations).
+      // In case of a pending queue stall or an AMO operation, transition away.
+      Idle: begin
+        // Prepare queue push
+        if (amo_op_t'(in_amo_i) == QPush) begin
+          // Write data at tail of queue
+          out_add_o = curr_tail_q;
+          out_write_o = 1'b1;
+        end
+
+        // Prepare queue pop
+        if (amo_op_t'(in_amo_i) == QPop) begin
+          // Read data at head of queue
+          out_add_o = curr_head_q;
+        end
+
+        // Request accepted (triggers memory access)
+        if (req_accepted) begin
+          // Reset meta data selection to default meta data
+          sresp_select_d = 1'b0;
+
+          // AMO operation
+          if (vld_amo_op) begin
+            amo_op_d = amo_op_t'(in_amo_i);
+            addr_d = in_address_i;
+            amo_operand_b_d = in_wdata_i;
+            state_d = DoAMO;
+          end
+
+          // Queue push
+          if (amo_op_t'(in_amo_i) == QPush) begin
+            if (queue_full) begin
+              // Note: Memory write is still executed but the tail is not incremented
+              // Set stalled flag
+              queue_stalled_d = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
+            end else begin
+              // Set increment flag
+              increment_tail = 1'b1;
+              // Previous queue pop failed due to empty queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d = ResolveQPopStall;
+              end
+            end
+          end
+
+          // Queue pop
+          if (amo_op_t'(in_amo_i) == QPop) begin
+            if (queue_empty) begin
+              // Set stalled flag
+              queue_stalled_d = 1'b1;
+              // Prevent acquisition of read/write response data
+              prevent_resp_acq = 1'b1;
+            end else begin
+              // Set increment flag
+              increment_head = 1'b1;
+              // Previous queue push failed due to full queue
+              if (queue_stalled_q) begin
+                queue_stalled_d = 1'b0;
+                state_d = ResolveQPushStall;
+              end
+            end
+          end
+        end
+      end
+
+      // DoAMO & WriteBackAMO states claim the memory interface for the AMO write
+      DoAMO, WriteBackAMO: begin
+        in_ready_o = 1'b0;
+        // Return to Idle one cycle later if we cut the path
+        state_d = (RegisterAmo && state_q != WriteBackAMO) ? WriteBackAMO : Idle;
+        // Commit AMO
+        out_req_o = 1'b1;
+        out_write_o = 1'b1;
+        out_add_o = addr_q;
+        out_be_o = 4'b1111;
+        amo_wb = 1'b1;
+        // Serve from register if we cut the path
+        if (RegisterAmo) begin
+          out_wdata_o = amo_result_q;
+        end else begin
+          out_wdata_o = amo_result;
+        end
+      end
+
+      // ResolveQPushStall state blocks any requests until the queue pop response
+      // has been accepted and then prepares the queue push response
+      // (a queue push stores its data even into a full queue but does not update the tail)
+      ResolveQPushStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o = 1'b0;
+        // Retrieve queue push data as dummy response (read data at tail of queue)
+        out_add_o = curr_tail_q;
+        out_write_o = 1'b0;
+        out_be_o = 4'b1111;
+        // Wait until pop response accepted
+        if (resp_accepted) begin
+          // Set increment flag
+          increment_tail = 1'b1;
+          // Trigger memory access
+          out_req_o = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d = Idle;
+        end
+      end
+
+      // ResolveQPopStall state blocks any requests until the queue push response
+      // has been accepted and then executes the queue pop
+      ResolveQPopStall: begin
+        // Do not accept any requests during resolve
+        in_ready_o = 1'b0;
+        // Prepare queue pop (read data at head of queue)
+        out_add_o = curr_head_q;
+        out_write_o = 1'b0;
+        out_be_o = 4'b1111;
+        // Wait until push response accepted
+        if (resp_accepted) begin
+          // Set increment flag
+          increment_head = 1'b1;
+          // Trigger memory access
+          out_req_o = 1'b1;
+          // Set meta data selection to stalled meta data
+          sresp_select_d = 1'b1;
+          // Return to Idle
+          state_d = Idle;
+        end
+      end
+      default:;
+    endcase
+  end
+
+  // ----------------
+  // AMO ALU
+  // ----------------
+  logic [33:0] adder_sum;
+  logic [32:0] adder_operand_a, adder_operand_b;
+
+  assign amo_operand_a = out_rdata_i;
+  assign adder_sum = adder_operand_a + adder_operand_b;
+  /* verilator lint_off WIDTH */
+  always_comb begin : amo_alu
+
+    adder_operand_a = $signed(amo_operand_a);
+    adder_operand_b = $signed(amo_operand_b_q);
+
+    amo_result = amo_operand_b_q;
+
+    unique case (amo_op_q)
+      // The default is to output operand_b
+      AMOSwap:;
+      AMOAdd: amo_result = adder_sum[31:0];
+      AMOAnd: amo_result = amo_operand_a & amo_operand_b_q;
+      AMOOr: amo_result = amo_operand_a | amo_operand_b_q;
+      AMOXor: amo_result = amo_operand_a ^ amo_operand_b_q;
+      AMOMax: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMin: begin
+        adder_operand_b = -$signed(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      AMOMaxu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_b_q : amo_operand_a;
+      end
+      AMOMinu: begin
+        adder_operand_a = $unsigned(amo_operand_a);
+        adder_operand_b = -$unsigned(amo_operand_b_q);
+        amo_result = adder_sum[32] ? amo_operand_a : amo_operand_b_q;
+      end
+      default: amo_result = '0;
+    endcase
+  end
+
+  if (RegisterAmo) begin : gen_amo_slice
+    `FFLNR(amo_result_q, amo_result, (state_q == DoAMO), clk_i)
+  end else begin : gen_amo_slice
+    assign amo_result_q = '0;
+  end
+
+  // ----------------
+  // QUEUE MANAGEMENT
+  // ----------------
+  assign queue_empty = (curr_head_q == curr_tail_q);
+  assign queue_full = (curr_head_q == next_tail_q);
+
+  assign increment_result = increment_operand + 1;
+
+  always_comb begin : queue_management
+    // Default
+    curr_tail_d = curr_tail_q;
+    next_tail_d = next_tail_q;
+    curr_head_d = curr_head_q;
+
+    // Increment queue counters
+    increment_operand = curr_head_q;
+    if (increment_tail) begin
+      increment_operand = next_tail_q;
+      curr_tail_d = next_tail_q;
+      next_tail_d = increment_result;
+    end
+    if (increment_head) begin
+      increment_operand = curr_head_q;
+      curr_head_d = increment_result;
+    end
+
+    // Select spill register for meta data
+    unique case (amo_op_t'(in_amo_i))
+      QPush: stalled_queue_op = queue_full;
+      QPop: stalled_queue_op = queue_empty;
+      default: stalled_queue_op = 1'b0;
+    endcase
+  end
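+  // Capacity note (a worked example derived from the comparisons above):
+  // the head/tail counters implement a circular buffer that keeps one slot
+  // free to distinguish full from empty. curr_head == curr_tail means
+  // empty, while curr_head == next_tail (i.e. tail + 1) means full. With
+  // XQueueSize = 4, a bank queue therefore holds at most 3 words at a time.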
+  // ----------------
+  // SEQUENTIAL PROCESS
+  // ----------------
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      state_q <= Idle;
+      amo_op_q <= amo_op_t'('0);
+      addr_q <= '0;
+      amo_operand_b_q <= '0;
+      rdata_in_vld_q <= 1'b0;
+      sresp_select_q <= 1'b0;
+      curr_tail_q <= 0;
+      next_tail_q <= 1;
+      curr_head_q <= 0;
+      queue_stalled_q <= 1'b0;
+    end else begin
+      state_q <= state_d;
+      amo_op_q <= amo_op_d;
+      addr_q <= addr_d;
+      amo_operand_b_q <= amo_operand_b_d;
+      rdata_in_vld_q <= rdata_in_vld_d;
+      sresp_select_q <= sresp_select_d;
+      curr_tail_q <= curr_tail_d;
+      next_tail_q <= next_tail_d;
+      curr_head_q <= curr_head_d;
+      queue_stalled_q <= queue_stalled_d;
+    end
+  end
+
+  // ----------------
+  // ASSERTIONS
+  // ----------------
+  // pragma translate_off
+  // Check for unsupported parameters
+  if (DataWidth != 32) begin
+    $error($sformatf("Module currently only supports DataWidth = 32. DataWidth is currently set to: %0d", DataWidth));
+  end
+
+  `ifndef VERILATOR
+  meta_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (meta_in_vld |-> meta_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_meta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  smeta_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (smeta_in_vld |-> smeta_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_stallmeta_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  rdata_full : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (rdata_in_vld_q |-> rdata_in_rdy))
+    else $fatal (1, "Trying to push new data although the i_rdata_register is not ready.");
+  `endif
+
+  `ifndef VERILATOR
+  stalled_queue : assert property(
+    @(posedge clk_i) disable iff (~rst_ni) (!(queue_stalled_q && smeta_in_vld)))
+    else $fatal (1, "Trying to stall a queue operation despite an already stalled queue.");
+  `endif
+  // pragma translate_on
+
+endmodule
diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv
index c8dd12e9b..fedf5c651 100644
--- a/hardware/tb/mempool_tb.sv
+++ b/hardware/tb/mempool_tb.sv
@@ -194,6 +194,183 @@ module mempool_tb;
       end: gen_wfi_tiles
     end: gen_wfi_groups
+`endif
+`endif
+
+  /************************
+   *  Mempool Bank Trace  *
+   ************************/
+  // Accessing signals hierarchically is not supported by Verilator
+`ifndef TARGET_SYNTHESIS
+`ifndef TARGET_VERILATOR
+  // Hierarchy path to the TCDM adapter
+  `ifdef XQUEUE_TCDM_ADAPTER
+    `define TCDM_ADAPTER(group,tile,bank) \
+      dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter_xqueue.i_tcdm_adapter
+  `else
+    `define TCDM_ADAPTER(group,tile,bank) \
+      dut.i_mempool_cluster.gen_groups[group].i_group.gen_tiles[tile].i_tile.gen_banks[bank].gen_tcdm_adapter.i_tcdm_adapter
+  `endif
+  int f;
+
+  initial begin
+    f = $fopen("trace_bank.dasm", "w");
+  end
+
+  localparam int BankTrace = `ifdef BANK_TRACE `BANK_TRACE `else 0 `endif;
+
+  genvar i,j,k;
+  generate
+    for (i = 0; i < NumGroups; i++) begin
+      for (j = 0; j < NumTilesPerGroup; j++) begin
+        for (k = 0; k < NumBanksPerTile; k++) begin
+          // Per-bank trace bookkeeping
+          bank_metadata_t metadata_sel;
+          group_id_t group_id, ini_group;
+          int unsigned ini_tile, ini_core;
+          int unsigned stall, stall_d, stall_q;
+          logic increment_head_q, increment_tail_q, vld_amo_op_q;
+          logic [31:0] q_push_data_d, q_push_data_q;
+          logic print_stall_d, print_stall_q;
+          logic print_lw_d, print_lw_q;
+          logic print_sw_d, print_sw_q;
+          logic [31:0] in_addr_d, in_addr_q;
+          logic [31:0] sw_d, sw_q;
+          string trace_entry;
+
+          assign group_id = group_id_t'(i);
+          assign metadata_sel = bank_metadata_t'(`TCDM_ADAPTER(i,j,k).in_meta_o);
+          assign q_push_data_d = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+
+          always_comb begin
+            // Defaults
+            stall_d = stall_q;
+            stall = stall_q;
+            print_stall_d = 1'b0;
+            print_lw_d = 1'b0;
+            print_sw_d = 1'b0;
+            in_addr_d = in_addr_q;
+            sw_d = sw_q;
+            // Decode the initiator of the operation from the metadata
+            if (metadata_sel.ini_addr >= NumCoresPerTile) begin
+              ini_group = $bits(group_id_t)'(metadata_sel.ini_addr - NumCoresPerTile) ^ group_id;
+              ini_tile = metadata_sel.tile_id;
+              ini_core = metadata_sel.core_id;
+            end else begin
+              ini_group = group_id;
+              ini_tile = j;
+              ini_core = metadata_sel.ini_addr;
+            end
+            `ifdef XQUEUE_TCDM_ADAPTER
+            // Stall calculation for queue operations
+            if (`TCDM_ADAPTER(i,j,k).increment_head || `TCDM_ADAPTER(i,j,k).increment_tail) begin
+              stall_d = 0;
+            end else begin
+              if (`TCDM_ADAPTER(i,j,k).queue_stalled_q) begin
+                stall_d = stall_q + 1;
+              end
+            end
+            // Record the cycles of a stalled queue operation when it is resolved
+            if (`TCDM_ADAPTER(i,j,k).queue_stalled_q && !(`TCDM_ADAPTER(i,j,k).queue_stalled_d)) begin
+              print_stall_d = 1'b1;
+              stall = stall_q;
+            end
+            `endif
+            // Track non-atomic loads and stores
+            if ((`TCDM_ADAPTER(i,j,k).in_amo_i == '0) && `TCDM_ADAPTER(i,j,k).in_valid_i && `TCDM_ADAPTER(i,j,k).in_ready_o) begin
+              in_addr_d = `TCDM_ADAPTER(i,j,k).in_address_i;
+              if (`TCDM_ADAPTER(i,j,k).in_write_i) begin
+                print_sw_d = 1'b1;
+                sw_d = `TCDM_ADAPTER(i,j,k).in_wdata_i;
+              end else begin
+                print_lw_d = 1'b1;
+              end
+            end
+          end
+
+          always_ff @(posedge clk or negedge rst_n) begin
+            if (!rst_n) begin
+              stall_q <= 0;
+              increment_head_q <= '0;
+              increment_tail_q <= '0;
+              vld_amo_op_q <= '0;
+              q_push_data_q <= '0;
+              print_stall_q <= '0;
+              print_lw_q <= '0;
+              print_sw_q <= '0;
+              in_addr_q <= '0;
+              sw_q <= '0;
+            end else begin
+              stall_q <= stall_d;
+              `ifdef XQUEUE_TCDM_ADAPTER
+              increment_head_q <= `TCDM_ADAPTER(i,j,k).increment_head;
+              increment_tail_q <= `TCDM_ADAPTER(i,j,k).increment_tail;
+              vld_amo_op_q <= `TCDM_ADAPTER(i,j,k).vld_amo_op && `TCDM_ADAPTER(i,j,k).req_accepted;
+              `else
+              increment_head_q <= '0;
+              increment_tail_q <= '0;
+              vld_amo_op_q <= '0;
+              `endif
+              q_push_data_q <= q_push_data_d;
+              print_stall_q <= print_stall_d;
+              print_lw_q <= print_lw_d;
+              print_sw_q <= print_sw_d;
+              in_addr_q <= in_addr_d;
+              sw_q <= sw_d;
+              // Print when a bank operation is retired
+              if (BankTrace && `TCDM_ADAPTER(i,j,k).in_valid_o) begin
+                `ifdef XQUEUE_TCDM_ADAPTER
+                // AMOs excluding Qpush and Qpop
+                if (vld_amo_op_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): %s, init=(%1d,%2d,%2d), address= 0x%h, data= %d\n",$time,i,j,k,`TCDM_ADAPTER(i,j,k).amo_op_q, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).addr_q,`TCDM_ADAPTER(i,j,k).amo_result);
+                  $fwrite(f, trace_entry);
+                end
+                // Queue operations
+                if (increment_head_q || increment_tail_q) begin
+                  if (increment_head_q) begin
+                    trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpop ,",$time,i,j,k);
+                    trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                  end else if (increment_tail_q) begin
+                    trace_entry = $sformatf("%t: (%1d,%2d,%2d): Qpush,",$time,i,j,k);
+                    trace_entry = $sformatf("%s init=(%1d,%2d,%2d), data= %d", trace_entry, ini_group, ini_tile, ini_core, q_push_data_q);
+                  end
+                  if (print_stall_q) begin
+                    trace_entry = $sformatf("%s: Qstall=%d\n", trace_entry, stall);
+                  end else begin
+                    trace_entry = $sformatf("%s\n",trace_entry);
+                  end
+                  $fwrite(f, trace_entry);
+                end
+                `endif
+                // Load
+                if (print_lw_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): Load Word , init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, `TCDM_ADAPTER(i,j,k).in_rdata_o);
+                  $fwrite(f, trace_entry);
+                end
+                // Store
+                if (print_sw_q) begin
+                  trace_entry = $sformatf("%t: (%1d,%2d,%2d): Store Word, init=(%1d,%2d,%2d), address= 0x%h, data = %d\n",$time,i,j,k, ini_group, ini_tile, ini_core, in_addr_q, sw_q);
+                  $fwrite(f, trace_entry);
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  endgenerate
+
+  final begin
+    $fclose(f);
+  end
+
+`endif
`endif
diff --git a/software/apps/memcpy/main.c b/software/apps/memcpy/main.c
index c92a688a1..f93d2e0d1 100644
--- a/software/apps/memcpy/main.c
+++ b/software/apps/memcpy/main.c
@@ -27,7 +27,7 @@
 #ifndef SIZE
 #define SIZE ((NUM_CORES) * (NUM_CORES)*2)
 #endif
-#define BANKING_FACTOR (4)
+// Assume a banking factor of 4
 uint32_t l2_data_a[SIZE] __attribute__((section(".l2")))
     __attribute__((aligned(NUM_CORES * 4 * 4)));
diff --git a/software/apps/systolic/conv_xqueue/main.c b/software/apps/systolic/conv_xqueue/main.c
new file mode 100644
index 000000000..f4c4339b8
--- /dev/null
+++ b/software/apps/systolic/conv_xqueue/main.c
@@ -0,0 +1,151 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "systolic/conv_xqueue.h"
+
+// Dimensions of matrix X
+#define DIM_X_M 258
+#define DIM_X_N 61
+
+// Dimensions of matrix Y
+#define DIM_Y_M (DIM_X_M - 2)
+#define DIM_Y_N (DIM_X_N - 2)
+
+uint32_t *tile_map;
+uint32_t *core_map;
+
+int32_t *matrix_X;
+int32_t *matrix_Y;
+
+int32_t weights[3][3] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+int32_t *matrix_W = (int32_t *)weights;
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      new_matrix[y * num_cols + x] = (int32_t)(y + x);
+    }
+  }
+  *matrix = new_matrix;
+}
+
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%8X\n", (uint32_t)matrix);
+  for (uint32_t i = 0; i < num_rows; ++i) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      printf("%5d ", matrix[i * num_columns + j]);
+    }
+    printf("\n");
+  }
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate tile and core maps
+  if (core_id == 0) {
+    tile_map = (uint32_t *)simple_malloc(num_cores * 4);
+    core_map = (uint32_t *)simple_malloc(num_cores * 4);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set tile and core maps
+  tile_map[core_id] = tile_id;
+  core_map[core_id] = core_id;
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out maps
+    // print_matrix((int32_t *)tile_map, 1, num_cores);
+    // print_matrix((int32_t *)core_map, 1, num_cores);
+
+    // Initialize systolic array
+    systolic_init(tile_map, core_map);
+
+    // Create and initialize matrices
+    generate_gradient_matrix(&matrix_X, DIM_X_M, DIM_X_N);
+    matrix_Y = (int32_t *)simple_malloc(DIM_Y_M * DIM_Y_N * 4);
+
+    // Print out matrix X
+    // printf("> Print Matrix X\n");
+    // print_matrix(matrix_X, DIM_X_M, DIM_X_N);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    // mempool_start_benchmark();
+  }
+
+  // Start benchmark for all cores
+  mempool_barrier(num_cores);
+  mempool_start_benchmark();
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  switch (core_id) {
+  case 0:
+    systolic_conv_front(DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  case (NUM_CORES - 1):
+    systolic_conv_end(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+    break;
+  default:
+    systolic_conv_mid(core_id, DIM_X_M, DIM_X_N, matrix_X, matrix_W, matrix_Y);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Stop benchmark for all cores
+  mempool_stop_benchmark();
+  mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    // mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out matrix Y
+    // printf("> Print Matrix Y\n");
+    // print_matrix(matrix_Y, DIM_Y_M, DIM_Y_N);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/apps/systolic/matmul_xqueue/main.c b/software/apps/systolic/matmul_xqueue/main.c
new file mode 100644
index 000000000..5c69fde7e
--- /dev/null
+++ b/software/apps/systolic/matmul_xqueue/main.c
@@ -0,0 +1,224 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "systolic/matmul_xqueue.h"
+
+// Dimensions of matrices
+#define DIM_M 24
+#define DIM_N 24
+#define DIM_P 24
+
+uint32_t *tile_mapping;
+uint32_t *core_mapping;
+
+int32_t *matrix_A;
+int32_t *matrix_B;
+
+uint32_t rep_count;
+
+systolic_matrix_t *syst_matrix_A;
+systolic_matrix_t *syst_matrix_B;
+systolic_matrix_t *syst_matrix_C;
+
+void generate_gradient_matrix(int32_t **matrix, uint32_t num_rows,
+                              uint32_t num_cols) {
+  int32_t *new_matrix = (int32_t *)simple_malloc(num_rows * num_cols * 4);
+  for (uint32_t y = 0; y < num_rows; ++y) {
+    for (uint32_t x = 0; x < num_cols; ++x) {
+      new_matrix[y * num_cols + x] = (int32_t)(y + x);
+    }
+  }
+  *matrix = new_matrix;
+}
+
+void print_matrix(int32_t const *matrix, uint32_t num_rows,
+                  uint32_t num_columns) {
+  printf("Matrix at 0x%8X\n", (uint32_t)matrix);
+  for (uint32_t i = 0; i < num_rows; ++i) {
+    for (uint32_t j = 0; j < num_columns; ++j) {
+      printf("%5d ", matrix[i * num_columns + j]);
+    }
+    printf("\n");
+  }
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t tile_id = core_id / 4;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Allocate systolic grid mapping
+  if (core_id == 0) {
+    tile_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+    core_mapping = (uint32_t *)simple_malloc(num_cores * 4);
+  }
+
+#if NUM_CORES == 16
+  // ----------
+  // 16 CORES
+  // ----------
+
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 4;
+  // uint32_t row_idx = core_id / 4;
+
+  // Assign grid position (col wise)
+  uint32_t col_idx = core_id / 4;
+  uint32_t row_idx = core_id % 4;
+
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 2;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 2;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+#elif NUM_CORES == 256
+  // ----------
+  // 256 CORES
+  // ----------
+
+  // Assign grid position (row wise)
+  // uint32_t col_idx = core_id % 16;
+  // uint32_t row_idx = core_id / 16;
+
+  // Assign grid position (col wise)
+  uint32_t col_idx = core_id / 16;
+  uint32_t row_idx = core_id % 16;
+
+  // Assign grid position (square wise)
+  // uint32_t col_idx = tile_id % 8;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // uint32_t row_idx = tile_id / 8;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+
+  // Assign grid position (square square wise)
+  // uint32_t group_id = tile_id / 16;
+  // uint32_t add_col = group_id % 2;
+  // uint32_t add_row = group_id / 2;
+  // uint32_t col_idx = tile_id % 4;
+  // col_idx *= 2;
+  // col_idx += core_id % 2;
+  // col_idx += add_col * 8;
+  // uint32_t row_idx = (tile_id % 16) / 4;
+  // row_idx *= 2;
+  // row_idx += (core_id % 4) / 2;
+  // row_idx += add_row * 8;
+#else
+#error Unsupported NUM_CORES
+#endif
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Set tile and core mapping
+  tile_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = tile_id;
+  core_mapping[row_idx * SYSTOLIC_SIZE + col_idx] = core_id;
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("> Initialize\n");
+
+    // Print out tile mapping
+    // print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+
+    // Print out core mapping
+    // print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
+
+    // Initialize systolic array
+    systolic_init(tile_mapping, core_mapping);
+
+    // Create systolic matrices
+    generate_gradient_matrix(&matrix_A, DIM_M, DIM_N);
+    systolic_matrix_create(&syst_matrix_A, matrix_A, DIM_M, DIM_N);
+    simple_free(matrix_A);
+    generate_gradient_matrix(&matrix_B, DIM_N, DIM_P);
+    systolic_matrix_create(&syst_matrix_B, matrix_B, DIM_N, DIM_P);
+    simple_free(matrix_B);
+    systolic_matrix_allocate(&syst_matrix_C, DIM_M, DIM_P);
+
+    // Print out systolic matrices A & B
+    // printf("> Print Systolic Matrices A & B\n");
+    // systolic_matrix_print(syst_matrix_A);
+    // systolic_matrix_print(syst_matrix_B);
+
+    // Set repetition count per submatrix of C (A->num_cols == B->num_rows)
+    rep_count = syst_matrix_A->num_cols / 2;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Start benchmark
+    printf("> Start\n");
+    mempool_start_benchmark();
+  }
+
+  // Start benchmark for all cores
+  // mempool_barrier(num_cores);
+  // mempool_start_benchmark();
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  if ((row_idx == 0) && (col_idx == 0)) {
+    systolic_rcp_pe(rep_count, syst_matrix_A, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx == 0) && (col_idx != 0)) {
+    systolic_cp_pe(col_idx, rep_count, syst_matrix_B, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx == 0)) {
+    systolic_rp_pe(row_idx, rep_count, syst_matrix_A, syst_matrix_C);
+  }
+
+  if ((row_idx != 0) && (col_idx != 0)) {
+    systolic_np_pe(row_idx, col_idx, rep_count, syst_matrix_C);
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Stop benchmark for all cores
+  // mempool_stop_benchmark();
+  // mempool_barrier(num_cores);
+
+  // Print out benchmark
+  if (core_id == 0) {
+    // Stop benchmark
+    mempool_stop_benchmark();
+    printf("> End\n");
+
+    // Print out systolic matrix C
+    // printf("> Print Systolic Matrix C\n");
+    // systolic_matrix_print(syst_matrix_C);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/apps/systolic/xqueue_test/main.c b/software/apps/systolic/xqueue_test/main.c
new file mode 100644
index 000000000..ee4b7ee92
--- /dev/null
+++ b/software/apps/systolic/xqueue_test/main.c
@@ -0,0 +1,100 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "alloc.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+int32_t *queue = 0;
+
+int32_t producer_check, consumer_check, dummy_check;
+
+// Queue push: blocks in hardware while the queue is full
+static inline int32_t queue_push(void *const queue, int32_t data) {
+  int32_t ret;
+  asm volatile("q.push.w %0, %1, (%2)" : "=r"(ret) : "r"(data), "r"(queue));
+  return ret;
+}
+
+// Queue pop: blocks in hardware while the queue is empty
+static inline int32_t queue_pop(void *const queue) {
+  int32_t ret;
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(ret) : "r"(queue));
+  return ret;
+}
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  extern int32_t __seq_start;
+
+  // Initialize synchronization variables
+  mempool_barrier_init(core_id);
+
+  // Initialization
+  mempool_init(core_id, num_cores);
+
+  // Setup
+  if (core_id == 0) {
+    printf("Initialize\n");
+    queue = &__seq_start;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Producer
+  if (core_id == 0) {
+    int32_t data[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+    int32_t check = 0;
+    int32_t resp;
+    int32_t dummy = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+    }
+    for (uint32_t i = 0; i < 16; ++i) {
+      resp = queue_push(queue, data[i]);
+      dummy += resp;
+      check += data[i];
+    }
+    producer_check = check;
+    dummy_check = dummy;
+  }
+
+  // Consumer
+  if (core_id == 1) {
+    int32_t read_data;
+    int32_t check = 0;
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      printf("Rx: %d\n", read_data);
+    }
+    printf("Burst Test\n");
+    for (uint32_t i = 0; i < 16; ++i) {
+      read_data = queue_pop(queue);
+      check += read_data;
+    }
+    consumer_check = check;
+  }
+
+  // Wait for all cores
+  mempool_barrier(num_cores);
+
+  // Print both checks
+  if (core_id == 0) {
+    printf("Check: %d/%d/%d\n", producer_check, consumer_check, dummy_check);
+  }
+
+  // Wait until all cores have finished
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/runtime/encoding.h b/software/runtime/encoding.h
index ce0ce72fa..ee518a9ea 100644
--- a/software/runtime/encoding.h
+++ b/software/runtime/encoding.h
@@ -2148,6 +2148,10 @@
 #define MASK_PV_PACKHI_B 0xfe00707f
 #define MATCH_PV_PACKLO_B 0xe0001057
 #define MASK_PV_PACKLO_B 0xfe00707f
+#define MATCH_Q_PUSH 0x3800202f
+#define MASK_Q_PUSH 0xf800707f
+#define MATCH_Q_POP 0x3000202f
+#define MASK_Q_POP 0xf9f0707f
 #define CSR_FFLAGS 0x1
 #define CSR_FRM 0x2
 #define CSR_FCSR 0x3
@@ -3379,6 +3383,8 @@ DECLARE_INSN(pv_pack, MATCH_PV_PACK, MASK_PV_PACK)
 DECLARE_INSN(pv_pack_h, MATCH_PV_PACK_H, MASK_PV_PACK_H)
 DECLARE_INSN(pv_packhi_b, MATCH_PV_PACKHI_B, MASK_PV_PACKHI_B)
 DECLARE_INSN(pv_packlo_b, MATCH_PV_PACKLO_B, MASK_PV_PACKLO_B)
+DECLARE_INSN(q_push, MATCH_Q_PUSH, MASK_Q_PUSH)
+DECLARE_INSN(q_pop, MATCH_Q_POP, MASK_Q_POP)
 #endif
 #ifdef DECLARE_CSR
 DECLARE_CSR(fflags, CSR_FFLAGS)
diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h
index 108e217a5..12376e52d 100644
--- a/software/runtime/runtime.h
+++ b/software/runtime/runtime.h
@@ -11,6 +11,9 @@
 #include <stddef.h>
 #include <stdint.h>
+// e.g. with the default `mempool` flavor: 4 cores/tile * banking factor 4 = 16 banks per tile
+#define NUM_BANKS_PER_TILE (NUM_CORES_PER_TILE * BANKING_FACTOR)
+
 extern char l1_alloc_base;
 extern uint32_t atomic_barrier;
 extern volatile uint32_t wake_up_reg;
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 717a432d9..ee4003663 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -64,6 +64,7 @@ DEFINES += -DPRINTF_DISABLE_SUPPORT_FLOAT -DPRINTF_DISABLE_SUPPORT_LONG_LONG -DP
 DEFINES += -DNUM_CORES=$(num_cores)
 DEFINES += -DNUM_GROUPS=$(num_groups)
 DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile)
+DEFINES += -DBANKING_FACTOR=$(banking_factor)
 DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}')
 DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}')
 DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')
diff --git a/software/runtime/systolic/conv_xqueue.h b/software/runtime/systolic/conv_xqueue.h
new file mode 100644
index 000000000..8e6e251de
--- /dev/null
+++ b/software/runtime/systolic/conv_xqueue.h
@@ -0,0 +1,1065 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* TODO: Add a description
+ * TODO: Limitation: NUM_COLS_Y >= 2 <=> NUM_COLS >= 4
+ * TODO: Completely fixed to a kernel size of 3
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Array of queue ptrs in row-major order (concatenated kernels)
+int32_t *queues_x_0[NUM_CORES];
+int32_t *queues_x_1[NUM_CORES];
+
+// Queue push: blocks in hardware while the queue is full
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)"
+               : "+r"(*ret)
+               : "r"(data), "r"(queue)
+               : "memory");
+}
+
+// Queue pop: blocks in hardware while the queue is empty
+static inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue) : "memory");
+}
+
+void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) {
+  // Create systolic array via queues
+  extern int32_t __seq_start;
+  uint32_t tile_id;
+  uint32_t core_id;
+  uint32_t tile_offset;
+  uint32_t core_offset;
+
+  for (uint32_t i = 0; i < NUM_CORES; ++i) {
+    tile_id = tile_map[i];
+    core_id = core_map[i];
+    tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
+    core_offset = (core_id % 4) * 4;
+    queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0;
+    queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1;
+  }
+
+  // Print out queue addresses
+  // printf("queues_x_0\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_0[i]);
+  // }
+  // printf("\n");
+  // printf("queues_x_1\n");
+  // for (uint32_t i = 0; i < NUM_CORES; ++i) {
+  //   printf("%5d ", queues_x_1[i]);
+  // }
+  // printf("\n");
+}
+
+void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
+                         int32_t const *__restrict__ X,
+                         int32_t const *__restrict__ W,
+                         int32_t *__restrict__ Y) {
+  int32_t *queue_next_x_0;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  register int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
+
+  // Assign queues
+  queue_next_x_0 = queues_x_0[1];
+  queue_next_x_1 = queues_x_1[1];
+
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
+    }
+  }
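+  // Sketch of the accumulator rotation below (an illustration inferred from
+  // the loop bodies, not additional functionality): each output pixel needs
+  // a window of three input columns, so three partial sums rotate roles
+  // every column:
+  //   column c+0: acc_y[2] is completed and stored, acc_y[0]/acc_y[1] accumulate
+  //   column c+1: acc_y[0] is completed and stored, acc_y[1]/acc_y[2] accumulate
+  //   column c+2: acc_y[1] is completed and stored, acc_y[2]/acc_y[0] accumulate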
+  // Execute row-wise systolic 2d convolution
+  row = 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ----------
+    // POPULATE 1
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 1];
+    curr_x[2] = X[(row - 0) * num_cols + 1];
+    curr_x[0] = X[(row - 2) * num_cols + 1];
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st row of weights
+    acc_y[2] += curr_x[0] * weights[0][1];
+    acc_y[0] += curr_x[0] * weights[0][0];
+    // MACs with 2nd row of weights
+    acc_y[2] += curr_x[1] * weights[1][1];
+    acc_y[0] += curr_x[1] * weights[1][0];
+    // MACs with 3rd row of weights
+    acc_y[2] += curr_x[2] * weights[2][1];
+    acc_y[0] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ------------------
+    // CONVOLUTION BURSTS
+    // ------------------
+    col = 2;
+    while (col < num_cols_y) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2];
+      // Reset finished accumulation
+      acc_y[2] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 1];
+      curr_x[2] = X[(row - 0) * num_cols + col + 1];
+      curr_x[0] = X[(row - 2) * num_cols + col + 1];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0];
+      // Reset finished accumulation
+      acc_y[0] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 2
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 2];
+      curr_x[2] = X[(row - 0) * num_cols + col + 2];
+      curr_x[0] = X[(row - 2) * num_cols + col + 2];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[1] += curr_x[0] * weights[0][2];
+      acc_y[2] += curr_x[0] * weights[0][1];
+      acc_y[0] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[1] += curr_x[1] * weights[1][2];
+      acc_y[2] += curr_x[1] * weights[1][1];
+      acc_y[0] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[1] += curr_x[2] * weights[2][2];
+      acc_y[2] += curr_x[2] * weights[2][1];
+      acc_y[0] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1];
+      // Reset finished accumulation
+      acc_y[1] = 0;
+      __asm__ __volatile__("" ::: "memory");
+      // ----------------
+      // INCREMENT COLUMN
+      // ----------------
+      col += 3;
+    }
+    __asm__ __volatile__("" ::: "memory");
+    // ---------------------
+    // CONVOLUTION REMAINDER
+    // ---------------------
+    while (col < num_cols) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Increment column index
+      ++col;
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector (col was already incremented above)
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // Push lower part of x vector
+      queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+      queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+    }
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
+    // -------------
+    // INCREMENT ROW
+    // -------------
+    row += NUM_CORES;
+  }
+
+  // Finish last row of systolic 2d convolution without pushing
+  if (row == num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Load x vector
+    curr_x[1] = X[(row - 1) * num_cols + 0];
+    curr_x[2] = X[(row - 0) * num_cols + 0];
+    curr_x[0] = X[(row - 2) * num_cols + 0];
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += 
curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + 1]; + curr_x[2] = X[(row - 0) * num_cols + 1]; + curr_x[0] = X[(row - 2) * num_cols + 1]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 0]; + curr_x[2] = X[(row - 0) * num_cols + col + 0]; + curr_x[0] = X[(row - 2) * num_cols + col + 0]; + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 1]; + curr_x[2] = X[(row - 0) * num_cols + col + 1]; + curr_x[0] = X[(row - 2) * num_cols + col + 1]; + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Load x vector + curr_x[1] = X[(row - 1) * num_cols + col + 2]; + curr_x[2] = X[(row - 0) * num_cols + col + 2]; + curr_x[0] = X[(row - 2) * num_cols + col + 2]; + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // 
---------------------
+    while (col < num_cols) {
+      // -----------
+      // ITERATION 0
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // MACs with 1st row of weights
+      acc_y[2] += curr_x[0] * weights[0][2];
+      acc_y[0] += curr_x[0] * weights[0][1];
+      acc_y[1] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[2] += curr_x[1] * weights[1][2];
+      acc_y[0] += curr_x[1] * weights[1][1];
+      acc_y[1] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[2] += curr_x[2] * weights[2][2];
+      acc_y[0] += curr_x[2] * weights[2][1];
+      acc_y[1] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2];
+      // Increment column index
+      ++col;
+      if (col >= num_cols)
+        break;
+      __asm__ __volatile__("" ::: "memory");
+      // -----------
+      // ITERATION 1
+      // -----------
+      // Load x vector
+      curr_x[1] = X[(row - 1) * num_cols + col + 0];
+      curr_x[2] = X[(row - 0) * num_cols + col + 0];
+      curr_x[0] = X[(row - 2) * num_cols + col + 0];
+      // MACs with 1st row of weights
+      acc_y[0] += curr_x[0] * weights[0][2];
+      acc_y[1] += curr_x[0] * weights[0][1];
+      acc_y[2] += curr_x[0] * weights[0][0];
+      // MACs with 2nd row of weights
+      acc_y[0] += curr_x[1] * weights[1][2];
+      acc_y[1] += curr_x[1] * weights[1][1];
+      acc_y[2] += curr_x[1] * weights[1][0];
+      // MACs with 3rd row of weights
+      acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+      // Increment column index
+      ++col;
+    }
+  }
+}
+
+void systolic_conv_mid(const uint32_t kernel_id, const uint32_t num_rows,
+                       const uint32_t num_cols, int32_t const *__restrict__ X,
+                       int32_t const *__restrict__ W, int32_t *__restrict__ Y) {
+  int32_t *queue_prev_x_0;
+  int32_t *queue_next_x_0;
+  int32_t *queue_prev_x_1;
+  int32_t *queue_next_x_1;
+  int32_t resp_x_0 __attribute__((unused)) = 0;
+  int32_t resp_x_1 __attribute__((unused)) = 0;
+  int32_t weights[3][3];
+  int32_t curr_x[3];
+  register int32_t acc_y[3] = {0, 0, 0};
+  uint32_t row;
+  uint32_t col;
+  uint32_t num_cols_y = num_cols - 2;
+
+  // Assign queues
+  queue_prev_x_0 = queues_x_0[kernel_id];
+  queue_next_x_0 = queues_x_0[kernel_id + 1];
+  queue_prev_x_1 = queues_x_1[kernel_id];
+  queue_next_x_1 = queues_x_1[kernel_id + 1];
+
+  // Load weights
+  for (uint32_t y = 0; y < 3; ++y) {
+    for (uint32_t x = 0; x < 3; ++x) {
+      weights[y][x] = W[y * 3 + x];
+    }
+  }
+
+  // Execute row-wise systolic 2d convolution
+  row = kernel_id + 2;
+  while (row < num_rows - 1) {
+    // ----------
+    // POPULATE 0
+    // ----------
+    // Pop and load x vector
+    queue_pop(queue_prev_x_1, &curr_x[1]);
+    curr_x[2] = X[row * num_cols + 0];
+    queue_pop(queue_prev_x_0, &curr_x[0]);
+    // Push lower part of x vector
+    queue_push(queue_next_x_0, curr_x[1], &resp_x_0);
+    queue_push(queue_next_x_1, curr_x[2], &resp_x_1);
+    // MACs with 1st column of weights
+    acc_y[2] += curr_x[0] * weights[0][0];
+    acc_y[2] += curr_x[1] * weights[1][0];
+    acc_y[2] += curr_x[2] * weights[2][0];
+    __asm__ __volatile__("" ::: "memory");
+    // ----------
+    // POPULATE 1
+    //
---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + 
acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // Push lower part of x vector + queue_push(queue_next_x_0, curr_x[1], &resp_x_0); + queue_push(queue_next_x_1, curr_x[2], &resp_x_1); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; + } + // ------------------ + // RESET ACCUMULATORS + // ------------------ + acc_y[0] = 0; + acc_y[1] = 0; + acc_y[2] = 0; + // ------------- + // INCREMENT ROW + // ------------- + row += NUM_CORES; + } + + // Finish last row of systolic 2d convolution without pushing + if (row == num_rows - 1) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += 
curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * 
weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0]; + } + } +} + +void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, + const uint32_t num_cols, int32_t const *__restrict__ X, + int32_t const *__restrict__ W, int32_t *__restrict__ Y) { + int32_t *queue_prev_x_0; + int32_t *queue_prev_x_1; + int32_t weights[3][3]; + int32_t curr_x[3]; + register int32_t acc_y[3] = {0, 0, 0}; + uint32_t col; + uint32_t num_cols_y = num_cols - 2; + + // Assign queues + queue_prev_x_0 = queues_x_0[kernel_id]; + queue_prev_x_1 = queues_x_1[kernel_id]; + + // Load weights + for (uint32_t y = 0; y < 3; ++y) { + for (uint32_t x = 0; x < 3; ++x) { + weights[y][x] = W[y * 3 + x]; + } + } + + // Execute row-wise systolic 2d convolution + for (uint32_t row = kernel_id + 2; row < num_rows; row += NUM_CORES) { + // ---------- + // POPULATE 0 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st column of weights + acc_y[2] += curr_x[0] * weights[0][0]; + acc_y[2] += curr_x[1] * weights[1][0]; + acc_y[2] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ---------- + // POPULATE 1 + // ---------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + __asm__ __volatile__("" ::: "memory"); + // ------------------ + // CONVOLUTION BURSTS + // ------------------ + col = 2; + while (col < num_cols_y) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 0]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] 
* weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 0] = acc_y[2]; + // Reset finished accumulation + acc_y[2] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 1]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[0] += curr_x[2] * weights[2][2]; + acc_y[1] += curr_x[2] * weights[2][1]; + acc_y[2] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 1] = acc_y[0]; + // Reset finished accumulation + acc_y[0] = 0; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 2 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col + 2]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[1] += curr_x[0] * weights[0][2]; + acc_y[2] += curr_x[0] * weights[0][1]; + acc_y[0] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[1] += curr_x[1] * weights[1][2]; + acc_y[2] += curr_x[1] * weights[1][1]; + acc_y[0] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[1] += curr_x[2] * weights[2][2]; + acc_y[2] += curr_x[2] * weights[2][1]; + acc_y[0] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2) + 2] = acc_y[1]; + // Reset finished accumulation + acc_y[1] = 0; + __asm__ __volatile__("" ::: "memory"); + // ---------------- + // INCREMENT COLUMN + // ---------------- + col += 3; + } + __asm__ __volatile__("" ::: "memory"); + // --------------------- + // CONVOLUTION REMAINDER + // --------------------- + while (col < num_cols) { + // ----------- + // ITERATION 0 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[2] += curr_x[0] * weights[0][2]; + acc_y[0] += curr_x[0] * weights[0][1]; + acc_y[1] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[2] += curr_x[1] * weights[1][2]; + acc_y[0] += curr_x[1] * weights[1][1]; + acc_y[1] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + acc_y[2] += curr_x[2] * weights[2][2]; + acc_y[0] += curr_x[2] * weights[2][1]; + acc_y[1] += curr_x[2] * weights[2][0]; + // Store finished accumulation + Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[2]; + // Increment column index + ++col; + if (col >= num_cols) + break; + __asm__ __volatile__("" ::: "memory"); + // ----------- + // ITERATION 1 + // ----------- + // Pop and load x vector + queue_pop(queue_prev_x_1, &curr_x[1]); + curr_x[2] = X[row * num_cols + col]; + queue_pop(queue_prev_x_0, &curr_x[0]); + // MACs with 1st row of weights + acc_y[0] += curr_x[0] * weights[0][2]; + acc_y[1] += curr_x[0] * weights[0][1]; + acc_y[2] += curr_x[0] * weights[0][0]; + // MACs with 2nd row of weights + acc_y[0] += curr_x[1] * weights[1][2]; + acc_y[1] += curr_x[1] * weights[1][1]; + acc_y[2] += curr_x[1] * weights[1][0]; + // MACs with 3rd row of weights + 
acc_y[0] += curr_x[2] * weights[2][2];
+      acc_y[1] += curr_x[2] * weights[2][1];
+      acc_y[2] += curr_x[2] * weights[2][0];
+      // Store finished accumulation
+      Y[(row - 2) * num_cols_y + (col - 2)] = acc_y[0];
+    }
+    // ------------------
+    // RESET ACCUMULATORS
+    // ------------------
+    acc_y[0] = 0;
+    acc_y[1] = 0;
+    acc_y[2] = 0;
+  }
+}
diff --git a/software/runtime/systolic/matmul_xqueue.h b/software/runtime/systolic/matmul_xqueue.h
new file mode 100644
index 000000000..c1f8aac3b
--- /dev/null
+++ b/software/runtime/systolic/matmul_xqueue.h
@@ -0,0 +1,996 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Gua Hao Khov, ETH Zurich
+
+/* This library implements a simple systolic architecture emulation
+ * using global code based orchestration
+ */
+
+/* A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix
+ * C = AB
+ * (max dimension is 16-bit)
+ * Matrix is processed in 2x2 submatrices with the following indexing
+ *
+ *          B B       0 1
+ *          B B       2 3
+ *
+ *   A A    C C   =   0 2    0 1
+ *   A A    C C       1 3    2 3
+ *
+ * e.g. C0 = A2 * B2 + A0 * B0
+ *
+ * We use two interleaved queues per direction
+ */
+
+#include "alloc.h"
+#include "printf.h"
+
+// Dimensions of square systolic array
+#define SYSTOLIC_SIZE 16
+
+// Systolic matrix
+typedef struct {
+  int32_t *matrix;
+  uint32_t num_rows;
+  uint32_t num_cols;
+} systolic_matrix_t;
+
+// TODO: SQRT ROOT OF NUM_CORES FOR SYSTOLIC SIZE
+
+// Array of queue ptrs in row-major order
+int32_t *queues_vert_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_vert_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_0[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+int32_t *queues_horz_1[SYSTOLIC_SIZE][SYSTOLIC_SIZE];
+
+// queue push
+static inline void queue_push(void *const queue, int32_t data,
+                              int32_t *const ret) {
+  asm volatile("q.push.w %0, %1, (%2)" : "+r"(*ret) : "r"(data), "r"(queue));
+}
+
+// queue pop
+inline void queue_pop(void *const queue, int32_t *const ret) {
+  asm volatile("q.pop.w %0, 0(%1)" : "=r"(*ret) : "r"(queue));
+}
+
+void systolic_init(uint32_t const *tile_mapping,
+                   uint32_t const *core_mapping) {
+  // Create systolic array via queues
+  extern int32_t __seq_start;
+  uint32_t grid_pos = 0;
+  uint32_t tile_id;
+  uint32_t core_id;
+  uint32_t tile_offset;
+  uint32_t core_offset;
+  for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+    for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+      tile_id = tile_mapping[grid_pos];
+      core_id = core_mapping[grid_pos];
+      tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
+      core_offset = core_id % 4 * 4;
+      queues_vert_0[y][x] = &__seq_start + tile_offset + core_offset + 0;
+      queues_vert_1[y][x] = &__seq_start + tile_offset + core_offset + 1;
+      queues_horz_0[y][x] = &__seq_start + tile_offset + core_offset + 2;
+      queues_horz_1[y][x] = &__seq_start + tile_offset + core_offset + 3;
+      ++grid_pos;
+    }
+  }
+
+  // Print out queue addresses
+  // printf("queues_vert_0\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_0[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_vert_1\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //     printf("%5d ", queues_vert_1[y][x]);
+  //   }
+  //   printf("\n");
+  // }
+  // printf("queues_horz_0\n");
+  // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) {
+  //   for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) {
+  //
printf("%5d ", queues_horz_0[y][x]); + // } + // printf("\n"); + // } + // printf("queues_horz_1\n"); + // for (uint32_t y = 0; y < SYSTOLIC_SIZE; ++y) { + // for (uint32_t x = 0; x < SYSTOLIC_SIZE; ++x) { + // printf("%5d ", queues_horz_1[y][x]); + // } + // printf("\n"); + // } +} + +void systolic_matrix_allocate(systolic_matrix_t **syst_matrix, + uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + + // Allocate matrix array + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_create(systolic_matrix_t **syst_matrix, int32_t *matrix, + uint32_t num_rows, uint32_t num_cols) { + // Round up row and col dimension to next multiple of two + uint32_t syst_num_rows = (uint32_t)((num_rows + 1) & 0xFFFE); + uint32_t syst_num_cols = (uint32_t)((num_cols + 1) & 0xFFFE); + + // Allocate matrix array + int32_t *array = (int32_t *)simple_malloc(syst_num_rows * syst_num_cols * 4); + + // Copy data into new matrix array + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + array[y * syst_num_cols + x] = matrix[y * num_cols + x]; + } + } + + // Zero padding of matrix array + if (syst_num_cols != num_cols) { + for (uint32_t y = 0; y < syst_num_rows; ++y) { + array[y * syst_num_cols + syst_num_cols - 1] = 0; + } + } + if (syst_num_rows != num_rows) { + for (uint32_t x = 0; x < syst_num_cols; ++x) { + array[(syst_num_rows - 1) * syst_num_cols + x] = 0; + } + } + + // Allocate systolic matrix + systolic_matrix_t *new_matrix = (systolic_matrix_t *)simple_malloc(3 * 4); + + // Assign values to systolic matrix + new_matrix->matrix = array; + new_matrix->num_rows = syst_num_rows; + new_matrix->num_cols = syst_num_cols; + + *syst_matrix = new_matrix; +} + +void systolic_matrix_print(systolic_matrix_t *syst_matrix) { + printf("Systolic matrix at 0x%08X\n", (uint32_t)syst_matrix); + uint32_t num_rows = syst_matrix->num_rows; + uint32_t num_cols = syst_matrix->num_cols; + int32_t *matrix = syst_matrix->matrix; + for (uint32_t y = 0; y < num_rows; ++y) { + for (uint32_t x = 0; x < num_cols; ++x) { + printf("%5d ", matrix[y * num_cols + x]); + } + printf("\n"); + } +} + +// row and column producing processing element +void systolic_rcp_pe(const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_A; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t 
anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_next_horz_0 = queues_horz_0[0][1]; + queue_next_horz_1 = queues_horz_1[0][1]; + queue_next_vert_0 = queues_vert_0[1][0]; + queue_next_vert_1 = queues_vert_1[1][0]; + + // Get matrix arrays + matrix_A = A->matrix; + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[y * num_cols_A + i]; + data_vert[0] = matrix_B[i * num_cols_B + x]; + data_horz[1] = matrix_A[(y + 1) * num_cols_A + i]; + data_vert[1] = matrix_B[i * num_cols_B + x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[y * num_cols_A + i + 1]; + data_vert[2] = matrix_B[(i + 1) * num_cols_B + x]; + data_horz[3] = matrix_A[(y + 1) * num_cols_A + i + 1]; + data_vert[3] = matrix_B[(i + 1) * num_cols_B + x + 1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } + } +} + +// column producing processing element +void systolic_cp_pe(const uint32_t col_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ B, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_B; + int32_t *matrix_C; + uint32_t num_cols_B; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_prev_horz_0 = queues_horz_0[0][col_idx]; + queue_prev_horz_1 = 
queues_horz_1[0][col_idx]; + if (col_idx == SYSTOLIC_SIZE - 1) { + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; + } else { + queue_next_horz_0 = queues_horz_0[0][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[0][col_idx + 1]; + } + queue_next_vert_0 = queues_vert_0[1][col_idx]; + queue_next_vert_1 = queues_vert_1[1][col_idx]; + + // Get matrix arrays + matrix_B = B->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_B = B->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Check if PE is at the right boundary + if (queue_next_horz_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } else { + // Execute 
step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x + shifted_x = x + 2 * col_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + data_vert[0] = matrix_B[i * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[1]); + data_vert[1] = matrix_B[i * num_cols_B + shifted_x + 1]; + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + data_vert[2] = matrix_B[(i + 1) * num_cols_B + shifted_x]; + queue_pop(queue_prev_horz_1, &data_horz[3]); + data_vert[3] = matrix_B[(i + 1) * num_cols_B + shifted_x + 1]; + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_push(queue_next_vert_0, data_horz[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_horz[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_push(queue_next_vert_0, data_horz[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_horz[3], &resp_vert_1); + } + } + } + } + } +} + +// row producing processing element +void systolic_rp_pe(const uint32_t row_idx, const uint32_t rep_count, + systolic_matrix_t const *__restrict__ A, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_A; + int32_t *matrix_C; + uint32_t num_cols_A; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_y; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_next_horz_0 = queues_horz_0[row_idx][1]; + queue_next_horz_1 = queues_horz_1[row_idx][1]; + queue_prev_vert_0 = 
queues_vert_0[row_idx][0]; + queue_prev_vert_1 = queues_vert_1[row_idx][0]; + if (row_idx == SYSTOLIC_SIZE - 1) { + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; + } else { + queue_next_vert_0 = queues_vert_0[row_idx + 1][0]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][0]; + } + + // Get matrix arrays + matrix_A = A->matrix; + matrix_C = C->matrix; + + // Get dimensions of matrices + num_cols_A = A->num_cols; + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // Check if PE is at the bottom boundary + if (queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } else { + // Execute step-wise matrix multiplication + for (uint32_t y 
= 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift y + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < 2 * rep_count; i += 2) { + data_horz[0] = matrix_A[shifted_y * num_cols_A + i]; + queue_pop(queue_prev_vert_0, &data_vert[0]); + data_horz[1] = matrix_A[(shifted_y + 1) * num_cols_A + i]; + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + data_horz[2] = matrix_A[shifted_y * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_0, &data_vert[2]); + data_horz[3] = matrix_A[(shifted_y + 1) * num_cols_A + i + 1]; + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_vert[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[1], &resp_horz_1); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_vert[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_vert[3], &resp_horz_1); + } + } + } + } + } +} + +// non-producing processing element +void systolic_np_pe(const uint32_t row_idx, const uint32_t col_idx, + const uint32_t rep_count, + systolic_matrix_t const *__restrict__ C) { + int32_t *queue_prev_horz_0; + int32_t *queue_prev_horz_1; + int32_t *queue_next_horz_0; + int32_t *queue_next_horz_1; + int32_t *queue_prev_vert_0; + int32_t *queue_prev_vert_1; + int32_t *queue_next_vert_0; + int32_t *queue_next_vert_1; + int32_t data_horz[4] = {0, 0, 0, 0}; + int32_t data_vert[4] = {0, 0, 0, 0}; + int32_t data_dummy __attribute__((unused)) = 0; + int32_t resp_horz_0 __attribute__((unused)) = 0; + int32_t resp_horz_1 __attribute__((unused)) = 0; + int32_t resp_vert_0 __attribute__((unused)) = 0; + int32_t resp_vert_1 __attribute__((unused)) = 0; + int32_t *matrix_C; + uint32_t num_rows_C; + uint32_t num_cols_C; + uint32_t shifted_x; + uint32_t shifted_y; + int32_t curr_element_0_C; + int32_t curr_element_1_C; + int32_t curr_element_2_C; + int32_t curr_element_3_C; + uint32_t anchor_row_0; + uint32_t anchor_row_1; + + // Assign queues + queue_prev_horz_0 = queues_horz_0[row_idx][col_idx]; + queue_prev_horz_1 = queues_horz_1[row_idx][col_idx]; + if (col_idx 
== SYSTOLIC_SIZE - 1) { + queue_next_horz_0 = NULL; + queue_next_horz_1 = NULL; + } else { + queue_next_horz_0 = queues_horz_0[row_idx][col_idx + 1]; + queue_next_horz_1 = queues_horz_1[row_idx][col_idx + 1]; + } + queue_prev_vert_0 = queues_vert_0[row_idx][col_idx]; + queue_prev_vert_1 = queues_vert_1[row_idx][col_idx]; + if (row_idx == SYSTOLIC_SIZE - 1) { + queue_next_vert_0 = NULL; + queue_next_vert_1 = NULL; + } else { + queue_next_vert_0 = queues_vert_0[row_idx + 1][col_idx]; + queue_next_vert_1 = queues_vert_1[row_idx + 1][col_idx]; + } + + // Get matrix arrays + matrix_C = C->matrix; + + // Get dimensions of matrices + num_rows_C = C->num_rows; + num_cols_C = C->num_cols; + + // PE is not at a boundary + if (queue_next_horz_0 && queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + 
queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } + + // PE is at the right boundary + if (!queue_next_horz_0 && queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + data_vert[0] += data_horz[0]; + data_vert[1] += data_horz[1]; + queue_push(queue_next_vert_0, data_vert[0], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[1], &resp_vert_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + data_vert[2] += data_horz[2]; + data_vert[3] += data_horz[3]; + queue_push(queue_next_vert_0, data_vert[2], &resp_vert_0); + queue_push(queue_next_vert_1, data_vert[3], &resp_vert_1); + } + } + } + } + } + + // PE is at the bottom boundary + if (queue_next_horz_0 && !queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + 
+ // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += data_horz[1] * data_vert[1]; + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + curr_element_0_C += data_horz[2] * data_vert[2]; + curr_element_1_C += data_horz[2] * data_vert[3]; + curr_element_2_C += data_horz[3] * data_vert[2]; + curr_element_3_C += data_horz[3] * data_vert[3]; + } + + // Store values + anchor_row_0 = shifted_y * num_cols_C + shifted_x; + anchor_row_1 = anchor_row_0 + num_cols_C; + matrix_C[anchor_row_0] = curr_element_0_C; + matrix_C[anchor_row_0 + 1] = curr_element_1_C; + matrix_C[anchor_row_1] = curr_element_2_C; + matrix_C[anchor_row_1 + 1] = curr_element_3_C; + } else { + // Pop and push dummy data + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + data_horz[0] += data_vert[0]; + data_horz[1] += data_vert[1]; + queue_push(queue_next_horz_0, data_horz[0], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[1], &resp_horz_1); + queue_pop(queue_prev_horz_0, &data_horz[2]); + queue_pop(queue_prev_vert_0, &data_vert[2]); + queue_pop(queue_prev_horz_1, &data_horz[3]); + queue_pop(queue_prev_vert_1, &data_vert[3]); + data_horz[2] += data_vert[2]; + data_horz[3] += data_vert[3]; + queue_push(queue_next_horz_0, data_horz[2], &resp_horz_0); + queue_push(queue_next_horz_1, data_horz[3], &resp_horz_1); + } + } + } + } + } + + // PE is at the bottom right corner + if (!queue_next_horz_0 && !queue_next_vert_0) { + // Execute step-wise matrix multiplication + for (uint32_t y = 0; y < num_rows_C; y += 2 * SYSTOLIC_SIZE) { + for (uint32_t x = 0; x < num_cols_C; x += 2 * SYSTOLIC_SIZE) { + // Shift x and y + shifted_x = x + 2 * col_idx; + shifted_y = y + 2 * row_idx; + + // Check if this PE is currently within the matrix C + if (shifted_x < num_cols_C && shifted_y < num_rows_C) { + // Reset values + curr_element_0_C = 0; + curr_element_1_C = 0; + curr_element_2_C = 0; + curr_element_3_C = 0; + + // Systolic matrix multiplication through MACs + for (uint32_t i = 0; i < rep_count; ++i) { + queue_pop(queue_prev_horz_0, &data_horz[0]); + queue_pop(queue_prev_vert_0, &data_vert[0]); + queue_pop(queue_prev_horz_1, &data_horz[1]); + queue_pop(queue_prev_vert_1, &data_vert[1]); + curr_element_0_C += data_horz[0] * data_vert[0]; + curr_element_1_C += data_horz[0] * data_vert[1]; + curr_element_2_C += data_horz[1] * data_vert[0]; + curr_element_3_C += 
                data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            curr_element_0_C += data_horz[2] * data_vert[2];
+            curr_element_1_C += data_horz[2] * data_vert[3];
+            curr_element_2_C += data_horz[3] * data_vert[2];
+            curr_element_3_C += data_horz[3] * data_vert[3];
+          }
+
+          // Store values
+          anchor_row_0 = shifted_y * num_cols_C + shifted_x;
+          anchor_row_1 = anchor_row_0 + num_cols_C;
+          matrix_C[anchor_row_0] = curr_element_0_C;
+          matrix_C[anchor_row_0 + 1] = curr_element_1_C;
+          matrix_C[anchor_row_1] = curr_element_2_C;
+          matrix_C[anchor_row_1 + 1] = curr_element_3_C;
+        } else {
+          // Pop and push dummy data
+          for (uint32_t i = 0; i < rep_count; ++i) {
+            queue_pop(queue_prev_horz_0, &data_horz[0]);
+            queue_pop(queue_prev_vert_0, &data_vert[0]);
+            queue_pop(queue_prev_horz_1, &data_horz[1]);
+            queue_pop(queue_prev_vert_1, &data_vert[1]);
+            data_dummy += data_horz[0] * data_vert[0];
+            data_dummy += data_horz[1] * data_vert[1];
+            queue_pop(queue_prev_horz_0, &data_horz[2]);
+            queue_pop(queue_prev_vert_0, &data_vert[2]);
+            queue_pop(queue_prev_horz_1, &data_horz[3]);
+            queue_pop(queue_prev_vert_1, &data_vert[3]);
+            data_dummy += data_horz[2] * data_vert[2];
+            data_dummy += data_horz[3] * data_vert[3];
+            // TODO: FIND SAFER WAY TO ENFORCE DATA DEPENDENCY
+            if (!data_dummy)
+              break;
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/toolchain/riscv-gnu-toolchain b/toolchain/riscv-gnu-toolchain
index 70acebe25..3b3b3dcbc 160000
--- a/toolchain/riscv-gnu-toolchain
+++ b/toolchain/riscv-gnu-toolchain
@@ -1 +1 @@
-Subproject commit 70acebe256fc49114b5f068fa79f03eb9affed09
+Subproject commit 3b3b3dcbc2c759924d25833374f4402d817b4b9c
diff --git a/toolchain/riscv-isa-sim/disasm/disasm.cc b/toolchain/riscv-isa-sim/disasm/disasm.cc
index fbb889775..d3d92c4ac 100644
--- a/toolchain/riscv-isa-sim/disasm/disasm.cc
+++ b/toolchain/riscv-isa-sim/disasm/disasm.cc
@@ -1515,6 +1515,10 @@ disassembler_t::disassembler_t(int xlen)
   DEFINE_RTYPE(pv_shuffle2_h);
   DEFINE_RTYPE(pv_shuffle2_b);
 
+  // Xqueues extension
+  DEFINE_XAMO(q_push)
+  DEFINE_XAMO_LR(q_pop)
+
   // provide a default disassembly for all instructions as a fallback
 #define DECLARE_INSN(code, match, mask) \
   add_insn(new disasm_insn_t(#code " (args unknown)", match, mask, {}));
diff --git a/toolchain/riscv-opcodes b/toolchain/riscv-opcodes
index 6bda68aa8..00b89eb39 160000
--- a/toolchain/riscv-opcodes
+++ b/toolchain/riscv-opcodes
@@ -1 +1 @@
-Subproject commit 6bda68aa82b78b47a61cbf0c08e39cf83a03f152
+Subproject commit 00b89eb39dbe8a980dd1485732b78231d01217c3
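
The only synchronization in the kernels above is the Xqueue instruction pair that this patch also teaches the disassembler: q.push.w (AMO format) and q.pop.w (LR format), wrapped by queue_push and queue_pop in matmul_xqueue.h. As a rough mental model, the plain-C sketch below mimics the blocking FIFO behavior the kernels depend on; sw_queue_t, QUEUE_SIZE, and the spin loops are illustrative assumptions, not the actual tcdm_adapter_xqueue.sv implementation, which resolves full and empty stalls inside the bank adapter rather than burning core cycles.

#include <stdint.h>

#define QUEUE_SIZE 4 // illustrative depth; the real depth is a build parameter

// Hypothetical software model of one bank queue (not the RTL behavior)
typedef struct {
  int32_t data[QUEUE_SIZE];
  volatile uint32_t head; // index of the oldest element
  volatile uint32_t tail; // index one past the newest element
} sw_queue_t;

// Blocks while the queue is full, as a q.push.w to a full queue would stall
static inline void sw_queue_push(sw_queue_t *q, int32_t value) {
  while (q->tail - q->head == QUEUE_SIZE)
    ; // spin; the hardware instead holds the request in the adapter
  q->data[q->tail % QUEUE_SIZE] = value;
  q->tail++;
}

// Blocks while the queue is empty, as a q.pop.w behaves like a stalled load
static inline int32_t sw_queue_pop(sw_queue_t *q) {
  while (q->head == q->tail)
    ; // spin until a producer pushes
  int32_t value = q->data[q->head % QUEUE_SIZE];
  q->head++;
  return value;
}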
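systolic_init carves the sequential memory region into four queue words per core (two vertical and two horizontal handles, matching the two interleaved queues per direction). The helper below restates that pointer arithmetic as a worked example, assuming SEQ_MEM_SIZE is the per-core sequential region in bytes and four cores per tile; queue_addr itself is not part of the patch.

#include <stdint.h>

#define SEQ_MEM_SIZE 2048 // assumed per-core sequential memory size in bytes

extern int32_t __seq_start; // start of the sequential memory region

// Mirrors the offset computation in systolic_init; offsets count 32-bit
// words because the queue handles are int32_t pointers.
static int32_t *queue_addr(uint32_t tile_id, uint32_t core_id, uint32_t idx) {
  uint32_t tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; // words per tile slot
  uint32_t core_offset = core_id % 4 * 4; // four queue words per core
  return &__seq_start + tile_offset + core_offset + idx; // idx = 0..3
}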
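The three convolution kernels chain into a core-to-core pipeline over image rows: each core loads the newest row of X itself, receives the two rows above it from its predecessor's queues, and forwards two rows to its successor, while the last core only consumes. A hypothetical top-level dispatch is sketched below; systolic_conv_mid and systolic_conv_end are the kernels from this patch, whereas the include path and the name and signature of the first kernel (here systolic_conv_front) are assumptions, since its definition begins before this excerpt.

#include "systolic/conv_xqueue.h" // include path assumed

void conv_dispatch(uint32_t kernel_id, uint32_t num_kernels, uint32_t num_rows,
                   uint32_t num_cols, int32_t const *X, int32_t const *W,
                   int32_t *Y) {
  if (kernel_id == 0)
    systolic_conv_front(num_rows, num_cols, X, W, Y); // loads all rows, pushes two
  else if (kernel_id == num_kernels - 1)
    systolic_conv_end(kernel_id, num_rows, num_cols, X, W, Y); // pops, never pushes
  else
    systolic_conv_mid(kernel_id, num_rows, num_cols, X, W, Y); // pops two, pushes two
}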
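For the matmul, each core's role follows from the queues the four PE kernels claim: systolic_rcp_pe connects to queues [0][1] and [1][0], so it sits at grid position (0,0); systolic_cp_pe fills the rest of the top row, systolic_rp_pe the rest of the left column, and systolic_np_pe the interior. The sketch below makes that mapping explicit; the dispatch function is not part of the patch, and rep_count presumably spans the inner matrix dimension in the 2-element steps each PE loop consumes.

#include "systolic/matmul_xqueue.h" // include path assumed

void matmul_dispatch(uint32_t row, uint32_t col, uint32_t rep_count,
                     systolic_matrix_t const *A, systolic_matrix_t const *B,
                     systolic_matrix_t const *C) {
  if (row == 0 && col == 0)
    systolic_rcp_pe(rep_count, A, B, C); // corner: feeds rows of A and columns of B
  else if (row == 0)
    systolic_cp_pe(col, rep_count, B, C); // top edge: feeds B columns downward
  else if (col == 0)
    systolic_rp_pe(row, rep_count, A, C); // left edge: feeds A rows rightward
  else
    systolic_np_pe(row, col, rep_count, C); // interior: forwards and computes
}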