Skip to content

Commit 7a28eff

Browse files
committed
restored drl align stage
1 parent bced714 commit 7a28eff

File tree

5 files changed

+106
-106
lines changed

5 files changed

+106
-106
lines changed

hw/rtl/tcu/VX_tcu_core.sv

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,11 @@ module VX_tcu_core import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
4646
localparam FACC_LATENCY = 2;
4747
localparam FEDP_LATENCY = FMUL_LATENCY + FACC_LATENCY;
4848
`else // TCU_DRL
49-
localparam FMUL_LATENCY = 2;
49+
localparam FMUL_LATENCY = 1;
50+
localparam FALN_LATENCY = 1;
5051
localparam FACC_LATENCY = 1;
5152
localparam FRND_LATENCY = 1;
52-
localparam FEDP_LATENCY = FMUL_LATENCY + FACC_LATENCY + FRND_LATENCY;
53+
localparam FEDP_LATENCY = FMUL_LATENCY + FALN_LATENCY + FACC_LATENCY + FRND_LATENCY;
5354
`endif
5455

5556
localparam PIPE_LATENCY = FEDP_LATENCY + 1;

hw/rtl/tcu/VX_tcu_fedp_drl.sv

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,11 @@ module VX_tcu_fedp_drl #(
3131
);
3232

3333
localparam TCK = 2 * N;
34-
localparam FMUL_LATENCY = 2;
34+
localparam FMUL_LATENCY = 1;
35+
localparam ALN_LATENCY = 1;
3536
localparam ACC_LATENCY = 1;
3637
localparam FRND_LATENCY = 1;
37-
localparam TOTAL_LATENCY= FMUL_LATENCY + ACC_LATENCY + FRND_LATENCY;
38+
localparam TOTAL_LATENCY= FMUL_LATENCY + ALN_LATENCY + ACC_LATENCY + FRND_LATENCY;
3839
`STATIC_ASSERT (LATENCY == 0 || LATENCY == TOTAL_LATENCY, ("invalid latency! expected=%0d, actual=%0d", TOTAL_LATENCY, LATENCY));
3940

4041
`UNUSED_VAR ({fmt_d, c_val});
@@ -50,52 +51,86 @@ module VX_tcu_fedp_drl #(
5051
assign b_col16[2*i+1] = b_col[i][31:16];
5152
end
5253

53-
//Transprecision Mul & Max Exp & Align Sigs
54-
wire [7:0] raw_max_exp;
55-
wire [TCK:0][24:0] aln_sigs;
54+
//Transprecision Mul & Max Exp
5655
wire [6:0] hi_c = c_val[31:25]; //c_val[24:0] acc is taken care of in acc stage
5756
wire fmt_sel = fmt_s[3];
57+
wire [7:0] raw_max_exp;
58+
wire [TCK:0][7:0] shift_amounts;
59+
wire [TCK:0][24:0] raw_sigs;
5860

5961
VX_tcu_drl_mul_exp #(
6062
.N(TCK+1)
6163
) mul_exp (
62-
.enable (enable),
63-
.fmt_s (fmt_s),
64-
.a_rows (a_row16),
65-
.b_cols (b_col16),
66-
.c_val (c_val[31:0]),
67-
.raw_max_exp (raw_max_exp),
68-
.sigs_out (aln_sigs)
64+
.enable (enable),
65+
.fmt_s (fmt_s),
66+
.a_rows (a_row16),
67+
.b_cols (b_col16),
68+
.c_val (c_val[31:0]),
69+
.raw_max_exp (raw_max_exp),
70+
.shift_amounts (shift_amounts),
71+
.raw_sigs (raw_sigs)
6972
);
7073

71-
//Stage 1/2 pipeline reg
74+
//Stage 1 pipeline reg
7275
wire [7:0] pipe_raw_max_exp;
73-
wire [TCK:0][24:0] pipe_aln_sigs;
76+
wire [TCK:0][7:0] pipe_shift_amounts;
77+
wire [TCK:0][24:0] pipe_raw_sigs;
7478
wire [6:0] pipe_hi_c;
7579
wire pipe_fmt_sel;
7680
VX_pipe_register #(
77-
.DATAW (8+((TCK+1)*25)+7+1),
81+
.DATAW (8+((TCK+1)*8)+((TCK+1)*25)+7+1),
7882
.DEPTH (FMUL_LATENCY)
79-
) pipe_align (
83+
) pipe_fmul (
84+
.clk (clk),
85+
.reset (reset),
86+
.enable (enable),
87+
.data_in ({raw_max_exp, shift_amounts, raw_sigs, hi_c, fmt_sel}),
88+
.data_out({pipe_raw_max_exp, pipe_shift_amounts, pipe_raw_sigs, pipe_hi_c, pipe_fmt_sel})
89+
);
90+
91+
//Significand Alignment
92+
wire [TCK:0][24:0] aln_sigs;
93+
wire [7:0] aln_max_exp = pipe_raw_max_exp;
94+
wire [6:0] aln_hi_c = pipe_hi_c;
95+
wire aln_fmt_sel = pipe_fmt_sel;
96+
97+
VX_tcu_drl_align #(
98+
.N(TCK+1)
99+
) sigs_aln (
100+
.shift_amounts (pipe_shift_amounts),
101+
.sigs_in (pipe_raw_sigs),
102+
.fmt_sel (pipe_fmt_sel),
103+
.sigs_out (aln_sigs)
104+
);
105+
106+
//Stage 2 pipeline reg
107+
wire [7:0] pipe_aln_max_exp;
108+
wire [TCK:0][24:0] pipe_aln_sigs;
109+
wire [6:0] pipe_aln_hi_c;
110+
wire pipe_aln_fmt_sel;
111+
VX_pipe_register #(
112+
.DATAW (8+((TCK+1)*25)+7+1),
113+
.DEPTH (ALN_LATENCY)
114+
) pipe_aln (
80115
.clk (clk),
81116
.reset (reset),
82117
.enable (enable),
83-
.data_in ({raw_max_exp, aln_sigs, hi_c, fmt_sel}),
84-
.data_out({pipe_raw_max_exp, pipe_aln_sigs, pipe_hi_c, pipe_fmt_sel})
118+
.data_in ({aln_max_exp, aln_sigs, aln_hi_c, aln_fmt_sel}),
119+
.data_out({pipe_aln_max_exp, pipe_aln_sigs, pipe_aln_hi_c, pipe_aln_fmt_sel})
85120
);
86121

87122
//Accumulate CSA reduction tree
88-
wire [7:0] acc_max_exp = pipe_raw_max_exp;
89-
wire [6:0] acc_hi_c = pipe_hi_c;
90-
wire acc_fmt_sel = pipe_fmt_sel;
123+
wire [7:0] acc_max_exp = pipe_aln_max_exp;
124+
wire [6:0] acc_hi_c = pipe_aln_hi_c;
125+
wire acc_fmt_sel = pipe_aln_fmt_sel;
91126
wire [25+$clog2(TCK+1):0] acc_sig; //23 mantissa + 1 hidden + 1 sign + log2(N) bits
92127
wire [TCK-1:0] sigs_sign; //sign bits of all operands (for int math)
93128

94129
VX_tcu_drl_acc #(
95130
.N(TCK+1)
96131
) csa_acc (
97132
.sigsIn (pipe_aln_sigs),
98-
.fmt_sel (pipe_fmt_sel),
133+
.fmt_sel (pipe_aln_fmt_sel),
99134
.sigOut (acc_sig),
100135
.signOuts (sigs_sign)
101136
);

hw/rtl/tcu/drl/VX_tcu_drl_align.sv

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright © 2019-2023
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
`include "VX_define.vh"
15+
16+
// Significand alignment stage. For each of the N operands, the 24-bit
// magnitude is right-shifted by that operand's shift amount and then turned
// into a 25-bit value (negated when the sign bit is set). When fmt_sel is
// set, the inputs are forwarded unchanged instead. The module has no clock or
// reset ports; all outputs are driven by continuous assignments.
module VX_tcu_drl_align #(
17+
parameter N = 5 //number of operands; includes c_val
18+
) (
19+
input wire [N-1:0][7:0] shift_amounts, // per-operand right-shift amount
20+
input wire [N-1:0][24:0] sigs_in, // per-operand input; read as {sign [24], magnitude [23:0]} when fmt_sel is 0
21+
input wire fmt_sel, // 1: forward sigs_in unchanged; 0: shift and apply sign
22+
output logic [N-1:0][24:0] sigs_out // per-operand 25-bit result
23+
);
24+
25+
//Aligned + signed significands
26+
for (genvar i = 0; i < N; i++) begin : g_align_signed
27+
wire fp_sign = sigs_in[i][24]; // top bit is used as the sign
28+
wire [23:0] fp_sig = sigs_in[i][23:0]; // lower 24 bits are used as the magnitude
29+
wire [23:0] adj_sig = fp_sig >> shift_amounts[i]; // logical right shift; bits shifted out are discarded
30+
wire [24:0] fp_val = fp_sign ? -adj_sig : {1'b0, adj_sig}; // negate when sign is set, otherwise zero-extend to 25 bits
31+
assign sigs_out[i] = fmt_sel ? sigs_in[i] : fp_val; // fmt_sel bypasses the shift/negate path
32+
end
33+
34+
endmodule
35+
36+
/*
37+
wire [23:0] adj_sig = shift_amount[3] ? 24'd0 : full_sig[i] >> shift_amount; //reducing switching activity (power) by clamping to 0 if
38+
//input won't make a significant impact on accumulated value
39+
*/

hw/rtl/tcu/drl/VX_tcu_drl_mul_exp.sv

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ module VX_tcu_drl_mul_exp #(
2222
input wire [N-2:0][15:0] b_cols,
2323
input wire [31:0] c_val,
2424
output logic [7:0] raw_max_exp,
25-
output logic [N-1:0][24:0] sigs_out
25+
output logic [N-1:0][7:0] shift_amounts,
26+
output logic [N-1:0][24:0] raw_sigs
2627
);
2728

2829
//raw fp signals
@@ -155,20 +156,16 @@ module VX_tcu_drl_mul_exp #(
155156
assign int_vals_mux[N-1] = c_val[24:0];
156157

157158
//Raw maximum exponent finder (in parallel to mul) and shift amounts
158-
wire [N-1:0][7:0] shift_amounts;
159159
VX_tcu_drl_max_exp #(
160160
.N(N)
161161
) find_max_exp (
162-
.exponents (mul_exp_mux),
163-
.max_exp (raw_max_exp),
162+
.exponents (mul_exp_mux),
163+
.max_exp (raw_max_exp),
164164
.shift_amounts (shift_amounts)
165165
);
166-
167-
//Aligned + signed significands
168-
for (genvar i = 0; i < N; i++) begin : g_align_signed
169-
wire [23:0] adj_sig = mul_sig_mux[i] >> shift_amounts[i];
170-
wire [24:0] fp_val = mul_sign_mux[i] ? -adj_sig : {1'b0, adj_sig};
171-
assign sigs_out[i] = (fmt_s[3]) ? int_vals_mux[i] : fp_val;
166+
167+
for (genvar i = 0; i < N; i++) begin : g_fp_int_sig_sel
168+
assign raw_sigs[i] = fmt_s[3] ? int_vals_mux[i] : {mul_sign_mux[i], mul_sig_mux[i]};
172169
end
173170

174171
endmodule

hw/rtl/tcu/drl/archive/VX_tcu_drl_align.sv

Lines changed: 0 additions & 72 deletions
This file was deleted.

0 commit comments

Comments
 (0)