|
11 | 11 | /* TODO DESCRIPTION |
12 | 12 | * TODO: LIMITATION NUM_COLS_Y >= 2 <=> NUM_COLS >= 4 |
13 | 13 | * TODO: COMPLETELY FIXED TO KERNEL SIZE OF 3 |
14 | | - * |
15 | | - * |
16 | | - * |
17 | | - * |
18 | 14 | */ |
19 | 15 |
|
| 16 | +// FIXME: Does not work with GCC -O3 optimization as only a limited number of |
| 17 | +// outstanding queue ops are supported |
| 18 | + |
20 | 19 | #include "alloc.h" |
21 | 20 | #include "printf.h" |
22 | 21 |
|
| 22 | +// Queue dump macro |
| 23 | +dump(queue, 16); |
| 24 | + |
23 | 25 | // Array of queue ptrs in row-major order (concatenated kernels) |
24 | 26 | int32_t *queues_x_0[NUM_CORES]; |
25 | 27 | int32_t *queues_x_1[NUM_CORES]; |
@@ -49,23 +51,21 @@ void systolic_init(uint32_t const *tile_map, uint32_t const *core_map) { |
49 | 51 | for (uint32_t i = 0; i < NUM_CORES; ++i) { |
50 | 52 | tile_id = tile_map[i]; |
51 | 53 | core_id = core_map[i]; |
52 | | - tile_offset = tile_id * 4 * SEQ_MEM_SIZE / 4; |
53 | | - core_offset = core_id % 4 * 4; |
| 54 | + tile_offset = tile_id * NUM_CORES_PER_TILE * SEQ_MEM_SIZE / sizeof(int32_t); |
| 55 | + core_offset = core_id % NUM_CORES_PER_TILE * 4; |
54 | 56 | queues_x_0[i] = &__seq_start + tile_offset + core_offset + 0; |
55 | 57 | queues_x_1[i] = &__seq_start + tile_offset + core_offset + 1; |
56 | 58 | } |
57 | 59 |
|
58 | 60 | // Print out queue addresses |
59 | | - // printf("queues_x_0\n"); |
| 61 | + // printf("\n[QUEUE] queues_x_0\n"); |
60 | 62 | // for (uint32_t i = 0; i < NUM_CORES; ++i) { |
61 | | - // printf("%5d ", queues_x_0[i]); |
62 | | - // } |
63 | | - // printf("\n"); |
64 | | - // printf("queues_x_1\n"); |
| 63 | + // dump_queue((uint32_t)(queues_x_0[i])); |
| 64 | +  // } |
| 65 | + // printf("\n[QUEUE] queues_x_1\n"); |
65 | 66 | // for (uint32_t i = 0; i < NUM_CORES; ++i) { |
66 | | - // printf("%5d ", queues_x_1[i]); |
67 | | - // } |
68 | | - // printf("\n"); |
| 67 | + // dump_queue((uint32_t)(queues_x_1[i])); |
| 68 | +  // } |
69 | 69 | } |
70 | 70 |
|
71 | 71 | void systolic_conv_front(const uint32_t num_rows, const uint32_t num_cols, |
@@ -894,6 +894,19 @@ void systolic_conv_end(const uint32_t kernel_id, const uint32_t num_rows, |
894 | 894 | // ---------- |
895 | 895 | // POPULATE 0 |
896 | 896 | // ---------- |
| 897 | + |
| 898 | + curr_x[2] = X[row * num_cols + 0]; |
| 899 | + asm volatile( |
| 900 | + "q.pop.w %[curr_x_1], 0(%[queue_1]) \n\t" |
| 901 | + "q.pop.w %[curr_x_0], 0(%[queue_0]) \n\t" |
| 902 | +      "p.mac %[acc_y_2], %[curr_x_2], %[weight_2_0] \n\t" |
| 903 | + : [curr_x_0] "=r"(curr_x[0]), [curr_x_1] "=r"(curr_x[1]), |
| 904 | + [acc_y_2] "+r"(acc_y[2]) |
| 905 | + : [queue_0] "r"(queue_prev_x_0), [queue_1] "r"(queue_prev_x_1), |
| 906 | +      [curr_x_2] "r"(curr_x[2]), [weight_0_0] "r"(weights[0][0]), |
| 907 | + [weight_1_0] "r"(weights[1][0]), [weight_2_0] "r"(weights[2][0]) |
| 908 | + : "memory"); |
| 909 | +
|
897 | 910 | // Pop and load x vector |
898 | 911 | queue_pop(queue_prev_x_1, &curr_x[1]); |
899 | 912 | curr_x[2] = X[row * num_cols + 0]; |
|
0 commit comments