Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
dfacf4d
[config] Parametrize scratchpad banking factor
sermazz Sep 14, 2022
3a13c7b
Merge pull request #57 from pulp-platform/cgra-sw-fix
SamuelRiedel Sep 14, 2022
738a8cf
[hardware] Add support for atomic Xqueue operations to TCDM adapter
khovg May 17, 2021
49bc8a5
[toolchain] Add toolchain support to xqueues extension (in standard a…
sermazz Aug 22, 2022
d7bad0a
[snitch] Add xqueues extension to instruction decoder
khovg May 18, 2021
1b6ea2b
[hardware] Fix response acquisition
khovg May 18, 2021
6224f5f
[hardware] Remove qpush data registers by abusing buffer slot
khovg May 18, 2021
03d622e
[apps/hardware] Implement xqueue_test app
SamuelRiedel Mar 23, 2022
20248cb
[apps] Implement systolic matmul_xqueue (1x1 matmul)
khovg May 19, 2021
ac43b0c
[apps] Optimize systolic matmul_xqueue for 2x2 matmul
khovg May 19, 2021
51b6eb5
[apps] Improve matmul_xqueue code
khovg May 20, 2021
5f2d0bc
[apps] Optimize matmul_xqueue with asm inline
khovg May 27, 2021
72441bc
[apps] Use 2 interleaved queues per direction in matmul_xqueue
khovg Jun 10, 2021
3674ea7
[apps] Improve matmul_xqueue code
khovg Jun 15, 2021
3ea900b
[apps] Implement systolic xqueue 2d convolution
sermazz Sep 13, 2022
8a3b524
[apps] Improve conv_xqueue code
khovg Jun 23, 2021
e29d84c
[apps] Improve 2d conv density for conv_xqueue
khovg Jun 25, 2021
0fbe88b
[apps] Improve conv_xqueue ecode
khovg Jun 25, 2021
244b9a5
[apps] Improve regularity of conv_xqueue
khovg Jul 16, 2021
4517da9
[apps] Fix license and format
sermazz Sep 14, 2022
4691691
[CHANGELOG] Add Xqueue extension and sw kernels
sermazz Sep 15, 2022
8e210c7
[config] Update systolic config
sermazz Sep 16, 2022
26f3877
[hardware] :bug: Add write response to Xqueue TCDM adapter
sermazz Sep 23, 2022
2d8fa4e
[software] Generalize systolic matmul NUM_CORES
sermazz Nov 21, 2022
0c2dffe
[tb] Add support to trace retired TCDM operations
Dec 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ sources:
- hardware/src/mempool_cc.sv
- hardware/src/snitch_addr_demux.sv
- hardware/src/tcdm_adapter.sv
- hardware/src/tcdm_adapter_xqueue.sv
- hardware/src/tcdm_shim.sv
- hardware/src/tcdm_wide_narrow_mux.sv
- hardware/src/address_scrambler.sv
Expand Down
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### Added
- Add a DMA
- Add support for hardware-accelerated queues for CGRA (RV32A extension)
- Add systolic implementation of matmul and 2d convolution exploiting hardware-accelerated queues
- Add ability to trace the operations retired by the TCDM adapters

### Fixed
- Measure the `wfi` stalls and stalls caused by `opc` properly
Expand All @@ -34,7 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Add the `terapool` configuration
- Add read-only caches to the hierarchical AXI interconnect
- Add a `memcpy` benchmark
- Add a systolic configuration including runtime support and a matmul application
- Add a systolic configuration for software-emulated CGRA including runtime support and a systolic matmul
- Add `axpy` kernel
- Add Spyglass linting scripts
- Add an OpenMP runtime and example applications
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ Thanks to all.
* Marc Gantenbein
* Marco Bertuletti
* Sergio Mazzola
* Vaibhav Krishna
* Yichao Zhang
1 change: 1 addition & 0 deletions config/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ flavors of MemPool. We currently support three flavors:
- `terapool`: 1024 cores, organized into 128 tiles with eight cores each
- `mempool`: 256 cores, organized into 64 tiles with four cores each (default)
- `minpool`: 16 cores, organized into 4 tiles with four cores each
- `systolic`: same as `mempool` but the cores form a CGRA

Use the `config` variable to define which configuration to take. For example,
to run a simulation with the `minpool` configuration, you would run
Expand Down
3 changes: 3 additions & 0 deletions config/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ dmas_per_group ?= 4
## Xqueues configuration ##
#############################

# Hardware queues for systolic (atomic ISA extension in TCDM adapter)
xqueue ?= 0

# XQueue extension's queue size in each memory bank (in words)
xqueue_size ?= 0

Expand Down
3 changes: 3 additions & 0 deletions config/mempool.mk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ num_groups ?= 4
# Number of cores per MemPool tile
num_cores_per_tile ?= 4

# L1 scratchpad banking factor
banking_factor ?= 4

# Radix for hierarchical AXI interconnect
axi_hier_radix ?= 20

Expand Down
3 changes: 3 additions & 0 deletions config/minpool.mk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ num_groups ?= 4
# Number of cores per MemPool tile
num_cores_per_tile ?= 4

# L1 scratchpad banking factor
banking_factor ?= 4

# Number of DMA backends in each group
dmas_per_group ?= 1

Expand Down
13 changes: 10 additions & 3 deletions config/systolic.mk
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ num_groups ?= 4
# Number of cores per MemPool tile
num_cores_per_tile ?= 4

# L1 scratchpad banking factor
banking_factor ?= 4

# Radix for hierarchical AXI interconnect
axi_hier_radix ?= 16
axi_hier_radix ?= 20

# Number of AXI masters per group
axi_masters_per_group ?= 1
Expand All @@ -29,6 +32,10 @@ seq_mem_size ?= 2048
## Xqueues configuration ##
#############################

# Xqueue extension's queue size (in queue entries)
# in each memory bank (assume banking factor of 4)
# Hardware queues for systolic (atomic ISA extension in TCDM adapter)
xqueue ?= 1

# Systolic queues size (assume banking factor of 4) for:
# - software queues emulation (size measured in queue entries)
# - hardware xqueue's queue in each memory bank (size measured in words)
xqueue_size ?= 4
3 changes: 3 additions & 0 deletions config/terapool.mk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ num_groups ?= 8
# Number of cores per Terapool tile
num_cores_per_tile ?= 8

# L1 scratchpad banking factor
banking_factor ?= 4

# Radix for hierarchical AXI interconnect
axi_hier_radix ?= 8

Expand Down
11 changes: 8 additions & 3 deletions hardware/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ verilator_top ?= mempool_tb_verilator
python ?= python3
# Enable tracing
snitch_trace ?= 0
bank_trace ?= 0

# Check if the specified QuestaSim version exists
ifeq (, $(shell which $(questa_cmd)))
Expand Down Expand Up @@ -87,15 +88,19 @@ endif
vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233
vlog_args += -work $(library)
# Defines
vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups)
vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor)
vlog_defs += -DL2_BASE=$(l2_base) -DL2_SIZE=$(l2_size) -DL2_BANKS=$(l2_banks)
vlog_defs += -DBOOT_ADDR=$(boot_addr) -DXPULPIMG=$(xpulpimg)
vlog_defs += -DSNITCH_TRACE=$(snitch_trace)
vlog_defs += -DSNITCH_TRACE=$(snitch_trace) -DBANK_TRACE=$(bank_trace)
vlog_defs += -DAXI_DATA_WIDTH=$(axi_data_width)
vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width)
vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group)
vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group)
vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size)
vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE=$(xqueue) -DXQUEUE_SIZE=$(xqueue_size)

ifeq ($(xqueue),1)
vlog_defs+= -DXQUEUE_TCDM_ADAPTER
endif

# Traffic generation enabled
ifdef tg
Expand Down
2 changes: 2 additions & 0 deletions hardware/deps/snitch/src/riscv_instr.sv
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,8 @@ package riscv_instr;
localparam logic [31:0] PV_PACK_H = 32'b1101001??????????000?????1010111;
localparam logic [31:0] PV_PACKHI_B = 32'b1101100??????????001?????1010111;
localparam logic [31:0] PV_PACKLO_B = 32'b1110000??????????001?????1010111;
localparam logic [31:0] Q_PUSH = 32'b00111????????????010?????0101111;
localparam logic [31:0] Q_POP = 32'b00110??00000?????010?????0101111;
/* CSR Addresses */
localparam logic [11:0] CSR_FFLAGS = 12'h1;
localparam logic [11:0] CSR_FRM = 12'h2;
Expand Down
43 changes: 41 additions & 2 deletions hardware/deps/snitch/src/snitch.sv
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ module snitch
parameter logic [31:0] MTVEC = BootAddr, // Exception Base Address (see privileged spec 3.1.7)
parameter bit RVE = 0, // Reduced-register Extension
parameter bit RVM = 1, // Enable IntegerMmultiplication & Division Extension
parameter int RegNrWritePorts = 2 // Implement one or two write ports into the register file
parameter int RegNrWritePorts = 2, // Implement one or two write ports into the register file
parameter bit Xqueue = 0
) (
input logic clk_i,
input logic rst_i,
Expand Down Expand Up @@ -152,7 +153,10 @@ module snitch
AMOMin = 4'h8,
AMOMinu = 4'h9,
AMOLR = 4'hA,
AMOSC = 4'hB
AMOSC = 4'hB,
// TODO(smazzola): parametrize
QPush = 4'hC, // Only used when Xqueue is enabled
QPop = 4'hD // Only used when Xqueue is enabled
} ls_amo;

logic [31:0] ld_result;
Expand Down Expand Up @@ -1324,6 +1328,41 @@ module snitch
end
/* end of Xpulpimg extension */

/* Xqueues extension */
// TODO(khovg): Add define to include instr
riscv_instr::Q_PUSH: begin
if (Xqueue) begin
alu_op = BypassA;
write_rd = 1'b0;
uses_rd = 1'b1;
is_load = 1'b1;
is_signed = 1'b1;
ls_size = Word;
ls_amo = QPush;
opa_select = Reg;
opb_select = Reg;
end else begin
illegal_inst = 1'b1;
end
end
// TODO(khovg): Two source registers are unnecessary
riscv_instr::Q_POP: begin
if (Xqueue) begin
alu_op = BypassA;
write_rd = 1'b0;
uses_rd = 1'b1;
is_load = 1'b1;
is_signed = 1'b1;
ls_size = Word;
ls_amo = QPop;
opa_select = Reg;
opb_select = Reg;
end else begin
illegal_inst = 1'b1;
end
end
/* end of Xqueues extension */

// TODO(zarubaf): Illegal Instructions
default: begin
illegal_inst = 1'b1;
Expand Down
9 changes: 5 additions & 4 deletions hardware/src/mempool_cc.sv
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ module mempool_cc

// Snitch Integer Core
snitch #(
.BootAddr ( BootAddr ),
.MTVEC ( MTVEC ),
.RVE ( RVE ),
.RVM ( RVM )
.BootAddr ( BootAddr ),
.MTVEC ( MTVEC ),
.RVE ( RVE ),
.RVM ( RVM ),
.Xqueue ( mempool_pkg::Xqueue )
) i_snitch (
.clk_i ,
.rst_i ,
Expand Down
5 changes: 4 additions & 1 deletion hardware/src/mempool_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ package mempool_pkg;
localparam integer unsigned DataWidth = 32;
localparam integer unsigned BeWidth = DataWidth / 8;
localparam integer unsigned ByteOffset = $clog2(BeWidth);
localparam integer unsigned BankingFactor = 4;
localparam integer unsigned BankingFactor = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif;
localparam bit LrScEnable = 1'b1;
localparam integer unsigned TCDMSizePerBank = 1024; // [B]
localparam integer unsigned NumBanks = NumCores * BankingFactor;
Expand Down Expand Up @@ -258,6 +258,9 @@ package mempool_pkg;
* QUEUE PARAMETERS *
**********************/

// Enable the Xqueue extension (hardware queues in the TCDM adapter)
localparam bit Xqueue = `ifdef XQUEUE `XQUEUE `else 1'b0 `endif;

// Size of xqueues in words (must be a power of two)
localparam int unsigned XQueueSize = `ifdef XQUEUE_SIZE `XQUEUE_SIZE `else 0 `endif;

Expand Down
87 changes: 59 additions & 28 deletions hardware/src/mempool_tile.sv
Original file line number Diff line number Diff line change
Expand Up @@ -381,34 +381,65 @@ module mempool_tile
assign bank_resp_payload[b].rdata.amo = '0; // Don't care
assign bank_resp_wide[b] = meta_out.wide;

tcdm_adapter #(
.AddrWidth (TCDMAddrMemWidth),
.DataWidth (DataWidth ),
.metadata_t (bank_metadata_t ),
.LrScEnable (LrScEnable ),
.RegisterAmo(1'b0 )
) i_tcdm_adapter (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.in_valid_i (bank_req_valid[b] ),
.in_ready_o (bank_req_ready[b] ),
.in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
.in_amo_i (bank_req_payload[b].wdata.amo ),
.in_write_i (bank_req_payload[b].wen ),
.in_wdata_i (bank_req_payload[b].wdata.data ),
.in_meta_i (meta_in ),
.in_be_i (bank_req_payload[b].be ),
.in_valid_o (bank_resp_valid[b] ),
.in_ready_i (bank_resp_ready[b] ),
.in_rdata_o (bank_resp_payload[b].rdata.data ),
.in_meta_o (meta_out ),
.out_req_o (req_valid ),
.out_add_o (req_addr ),
.out_write_o (req_write ),
.out_wdata_o (req_wdata ),
.out_be_o (req_be ),
.out_rdata_i (resp_rdata )
);
if (Xqueue) begin: gen_tcdm_adapter_xqueue
tcdm_adapter_xqueue #(
.AddrWidth (TCDMAddrMemWidth),
.DataWidth (DataWidth ),
.XQueueSize (XQueueSize ),
.metadata_t (bank_metadata_t ),
.RegisterAmo(1'b0 )
) i_tcdm_adapter (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.in_valid_i (bank_req_valid[b] ),
.in_ready_o (bank_req_ready[b] ),
.in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
.in_amo_i (bank_req_payload[b].wdata.amo ),
.in_write_i (bank_req_payload[b].wen ),
.in_wdata_i (bank_req_payload[b].wdata.data ),
.in_meta_i (meta_in ),
.in_be_i (bank_req_payload[b].be ),
.in_valid_o (bank_resp_valid[b] ),
.in_ready_i (bank_resp_ready[b] ),
.in_rdata_o (bank_resp_payload[b].rdata.data ),
.in_meta_o (meta_out ),
.out_req_o (req_valid ),
.out_add_o (req_addr ),
.out_write_o (req_write ),
.out_wdata_o (req_wdata ),
.out_be_o (req_be ),
.out_rdata_i (resp_rdata )
);
end else begin: gen_tcdm_adapter
tcdm_adapter #(
.AddrWidth (TCDMAddrMemWidth),
.DataWidth (DataWidth ),
.metadata_t (bank_metadata_t ),
.LrScEnable (LrScEnable ),
.RegisterAmo(1'b0 )
) i_tcdm_adapter (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.in_valid_i (bank_req_valid[b] ),
.in_ready_o (bank_req_ready[b] ),
.in_address_i(bank_req_payload[b].tgt_addr[idx_width(NumBanksPerTile) +: TCDMAddrMemWidth]),
.in_amo_i (bank_req_payload[b].wdata.amo ),
.in_write_i (bank_req_payload[b].wen ),
.in_wdata_i (bank_req_payload[b].wdata.data ),
.in_meta_i (meta_in ),
.in_be_i (bank_req_payload[b].be ),
.in_valid_o (bank_resp_valid[b] ),
.in_ready_i (bank_resp_ready[b] ),
.in_rdata_o (bank_resp_payload[b].rdata.data ),
.in_meta_o (meta_out ),
.out_req_o (req_valid ),
.out_add_o (req_addr ),
.out_write_o (req_write ),
.out_wdata_o (req_wdata ),
.out_be_o (req_be ),
.out_rdata_i (resp_rdata )
);
end

// Bank
tc_sram #(
Expand Down
Loading