Skip to content

Commit 58d6ecc

Browse files
committed
[software] Fix systolic software parametrization
1 parent 5397946 commit 58d6ecc

File tree

4 files changed

+36
-21
lines changed

4 files changed

+36
-21
lines changed

software/apps/systolic/conv_xqueue/main.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ void print_matrix(int32_t const *matrix, uint32_t num_rows,
5656
int main() {
5757
uint32_t core_id = mempool_get_core_id();
5858
uint32_t num_cores = mempool_get_core_count();
59-
uint32_t tile_id = core_id / 4;
59+
uint32_t tile_id = core_id / NUM_CORES_PER_TILE;
6060

6161
// Initialize synchronization variables
6262
mempool_barrier_init(core_id);
@@ -66,8 +66,8 @@ int main() {
6666

6767
// Allocate tile and core maps
6868
if (core_id == 0) {
69-
tile_map = (uint32_t *)simple_malloc(num_cores * 4);
70-
core_map = (uint32_t *)simple_malloc(num_cores * 4);
69+
tile_map = (uint32_t *)simple_malloc(num_cores * sizeof(uint32_t));
70+
core_map = (uint32_t *)simple_malloc(num_cores * sizeof(uint32_t));
7171
}
7272

7373
// Wait for all cores

software/apps/systolic/matmul_xqueue/main.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,10 @@ int main() {
142142
printf("> Initialize\n");
143143

144144
// Print out tile mapping
145-
//print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
145+
// print_matrix((int32_t *)tile_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
146146

147147
// Print out core mapping
148-
//print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
148+
// print_matrix((int32_t *)core_mapping, SYSTOLIC_SIZE, SYSTOLIC_SIZE);
149149

150150
// Initialize systolic array
151151
systolic_init(tile_mapping, core_mapping);

software/runtime/runtime.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,12 @@
1111
#include <stddef.h>
1212
#include <stdint.h>
1313

14-
#define NUM_BANKS_PER_TILE NUM_CORES_PER_TILE * BANKING_FACTOR
14+
#define NUM_BANKS_PER_TILE NUM_CORES_PER_TILE *BANKING_FACTOR
1515

1616
#define SIZEOF_INT32_T 4 // hack: result of sizeof(int32_t), which the preprocessor cannot evaluate inside the #if below
17-
#if ((NUM_CORES * SEQ_MEM_SIZE) < (NUM_CORES * STACK_SIZE + NUM_CORES * BANKING_FACTOR * SIZEOF_INT32_T * XQUEUE_SIZE))
17+
#if ((NUM_CORES * SEQ_MEM_SIZE) < \
18+
(NUM_CORES * STACK_SIZE + \
19+
NUM_CORES * BANKING_FACTOR * SIZEOF_INT32_T * XQUEUE_SIZE))
1820
#error Sequential memory required for stack and Xqueues is bigger than the available one
1921
#endif
2022

software/runtime/systolic/conv_xqueue.h

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,17 @@
1111
/* TODO DESCRIPTION
1212
* TODO: LIMITATION NUM_COLS_Y >= 2 <=> NUM_COLS >= 4
1313
* TODO: COMPLETELY FIXED TO KERNEL SIZE OF 3
14-
*
15-
*
16-
*
17-
*
1814
*/
1915

16+
// FIXME: Does not work with GCC -O3 optimization as only a limited number of
17+
// outstanding queue ops are supported
18+
2019
#include "alloc.h"
2120
#include "printf.h"
2221

22+
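// Queue dump macro (presumably instantiates the dump_queue() helper used by the commented-out debug output in systolic_init; confirm)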
23+
dump(queue, 16);
24+
2325
// Array of queue ptrs in row-major order (concatenated kernels)
2426
int32_t *queues_x_0[NUM_CORES];
2527
int32_t *queues_x_1[NUM_CORES];
@@ -49,23 +51,21 @@ void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) {
4951
for (uint32_t i = 0; i < NUM_CORES; ++i) {
5052
tile_id = tile_map[i];
5153
core_id = core_map[i];
52-
tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4;
53-
core_offset = core_id % 4 * 4;
54+
tile_offset = tile_id * NUM_CORES_PER_TILE * SEQ_MEM_SIZE / sizeof(int32_t);
55+
core_offset = core_id % NUM_CORES_PER_TILE * 4;
5456
queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0;
5557
queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1;
5658
}
5759

5860
// Print out queue addresses
59-
// printf("queues_x_0\n");
61+
// printf("\n[QUEUE] queues_x_0\n");
6062
// for (uint32_t i = 0; i < NUM_CORES; ++i) {
61-
// printf("%5d ", queues_x_0[i]);
62-
// }
63-
// printf("\n");
64-
// printf("queues_x_1\n");
63+
// dump_queue((uint32_t)(queues_x_0[i]));
64+
//}
65+
// printf("\n[QUEUE] queues_x_1\n");
6566
// for (uint32_t i = 0; i < NUM_CORES; ++i) {
66-
// printf("%5d ", queues_x_1[i]);
67-
// }
68-
// printf("\n");
67+
// dump_queue((uint32_t)(queues_x_1[i]));
68+
//}
6969
}
7070

7171
void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols,
@@ -894,6 +894,19 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows,
894894
// ----------
895895
// POPULATE 0
896896
// ----------
897+
898+
curr_x[2] = X[row * num_cols + 0];
899+
asm volatile(
900+
"q.pop.w %[curr_x_1], 0(%[queue_1]) \n\t"
901+
"q.pop.w %[curr_x_0], 0(%[queue_0]) \n\t"
902+
"p.mac %[acc_y_2], \n\t"
903+
: [curr_x_0] "=r"(curr_x[0]), [curr_x_1] "=r"(curr_x[1]),
904+
[acc_y_2] "+r"(acc_y[2])
905+
: [queue_0] "r"(queue_prev_x_0), [queue_1] "r"(queue_prev_x_1),
906+
[curr_x_2] "r"(curr_x[2])[weight_0_0] "r"(weights[0][0]),
907+
[weight_1_0] "r"(weights[1][0]), [weight_2_0] "r"(weights[2][0])
908+
: "memory");
909+
897910
// Pop and load x vector
898911
queue_pop(queue_prev_x_1, &curr_x[1]);
899912
curr_x[2] = X[row * num_cols + 0];

0 commit comments

Comments
 (0)