restored drl align stage

NikhilRout · NikhilRout · commit 7a28efff768e · 2025-10-01T00:59:53.000+05:30
diff --git a/hw/rtl/tcu/VX_tcu_core.sv b/hw/rtl/tcu/VX_tcu_core.sv
@@ -46,10 +46,11 @@ module VX_tcu_core import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
     localparam FACC_LATENCY = 2;
     localparam FEDP_LATENCY = FMUL_LATENCY + FACC_LATENCY;
 `else // TCU_DRL
-    localparam FMUL_LATENCY = 2;
+    localparam FMUL_LATENCY = 1;
+    localparam FALN_LATENCY = 1;
     localparam FACC_LATENCY = 1;
     localparam FRND_LATENCY = 1;
-    localparam FEDP_LATENCY = FMUL_LATENCY + FACC_LATENCY + FRND_LATENCY;
+    localparam FEDP_LATENCY = FMUL_LATENCY + FALN_LATENCY + FACC_LATENCY + FRND_LATENCY;
 `endif
 
     localparam PIPE_LATENCY = FEDP_LATENCY + 1;
diff --git a/hw/rtl/tcu/VX_tcu_fedp_drl.sv b/hw/rtl/tcu/VX_tcu_fedp_drl.sv
@@ -31,10 +31,11 @@ module VX_tcu_fedp_drl #(
 );
 
     localparam TCK = 2 * N;
-    localparam FMUL_LATENCY = 2;
+    localparam FMUL_LATENCY = 1;
+    localparam ALN_LATENCY  = 1;
     localparam ACC_LATENCY  = 1;
     localparam FRND_LATENCY = 1;
-    localparam TOTAL_LATENCY= FMUL_LATENCY + ACC_LATENCY + FRND_LATENCY;
+    localparam TOTAL_LATENCY= FMUL_LATENCY + ALN_LATENCY + ACC_LATENCY + FRND_LATENCY;
     `STATIC_ASSERT (LATENCY == 0 || LATENCY == TOTAL_LATENCY, ("invalid latency! expected=%0d, actual=%0d", TOTAL_LATENCY, LATENCY));
 
     `UNUSED_VAR ({fmt_d, c_val});
@@ -50,52 +51,86 @@ module VX_tcu_fedp_drl #(
         assign b_col16[2*i+1] = b_col[i][31:16];
     end
 
-    //Transprecision Mul & Max Exp & Align Sigs
-    wire [7:0] raw_max_exp;
-    wire [TCK:0][24:0] aln_sigs;
+    //Transprecision Mul, Raw Max Exp & per-operand Shift Amounts (alignment itself is done in the next stage)
     wire [6:0] hi_c = c_val[31:25];   //c_val[24:0] acc is taken care of in acc stage
     wire fmt_sel = fmt_s[3];
+    wire [7:0] raw_max_exp;
+    wire [TCK:0][7:0] shift_amounts;
+    wire [TCK:0][24:0] raw_sigs;
 
     VX_tcu_drl_mul_exp #(
         .N(TCK+1)
     ) mul_exp (
-        .enable       (enable),
-        .fmt_s        (fmt_s),
-        .a_rows       (a_row16),
-        .b_cols       (b_col16),
-        .c_val        (c_val[31:0]),
-        .raw_max_exp  (raw_max_exp),
-        .sigs_out     (aln_sigs)
+        .enable        (enable),
+        .fmt_s         (fmt_s),
+        .a_rows        (a_row16),
+        .b_cols        (b_col16),
+        .c_val         (c_val[31:0]),
+        .raw_max_exp   (raw_max_exp),
+        .shift_amounts (shift_amounts),
+        .raw_sigs      (raw_sigs)
     );
 
-    //Stage 1/2 pipeline reg
+    //Stage 1 pipeline reg
     wire [7:0] pipe_raw_max_exp;
-    wire [TCK:0][24:0] pipe_aln_sigs;
+    wire [TCK:0][7:0] pipe_shift_amounts;
+    wire [TCK:0][24:0] pipe_raw_sigs;
     wire [6:0] pipe_hi_c;
     wire pipe_fmt_sel;
     VX_pipe_register #(
-        .DATAW (8+((TCK+1)*25)+7+1),
+        .DATAW (8+((TCK+1)*8)+((TCK+1)*25)+7+1),
         .DEPTH (FMUL_LATENCY)
-    ) pipe_align (
+    ) pipe_fmul (
+        .clk     (clk),
+        .reset   (reset),
+        .enable  (enable),
+        .data_in ({raw_max_exp, shift_amounts, raw_sigs, hi_c, fmt_sel}),
+        .data_out({pipe_raw_max_exp, pipe_shift_amounts, pipe_raw_sigs, pipe_hi_c, pipe_fmt_sel})
+    );
+
+    //Significand Alignment
+    wire [TCK:0][24:0] aln_sigs;
+    wire [7:0] aln_max_exp = pipe_raw_max_exp;
+    wire [6:0] aln_hi_c = pipe_hi_c;
+    wire aln_fmt_sel = pipe_fmt_sel;
+
+    VX_tcu_drl_align #(
+        .N(TCK+1)
+    ) sigs_aln (
+        .shift_amounts (pipe_shift_amounts),
+        .sigs_in       (pipe_raw_sigs),
+        .fmt_sel       (pipe_fmt_sel),
+        .sigs_out      (aln_sigs)
+    );
+
+    //Stage 2 pipeline reg
+    wire [7:0] pipe_aln_max_exp;
+    wire [TCK:0][24:0] pipe_aln_sigs;
+    wire [6:0] pipe_aln_hi_c;
+    wire pipe_aln_fmt_sel;
+    VX_pipe_register #(
+        .DATAW (8+((TCK+1)*25)+7+1),
+        .DEPTH (ALN_LATENCY)
+    ) pipe_aln (
         .clk     (clk),
         .reset   (reset),
         .enable  (enable),
-        .data_in ({raw_max_exp, aln_sigs, hi_c, fmt_sel}),
-        .data_out({pipe_raw_max_exp, pipe_aln_sigs, pipe_hi_c, pipe_fmt_sel})
+        .data_in ({aln_max_exp, aln_sigs, aln_hi_c, aln_fmt_sel}),
+        .data_out({pipe_aln_max_exp, pipe_aln_sigs, pipe_aln_hi_c, pipe_aln_fmt_sel})
     );
 
     //Accumulate CSA reduction tree
-    wire [7:0] acc_max_exp = pipe_raw_max_exp;
-    wire [6:0] acc_hi_c = pipe_hi_c;
-    wire acc_fmt_sel = pipe_fmt_sel;
+    wire [7:0] acc_max_exp = pipe_aln_max_exp;
+    wire [6:0] acc_hi_c = pipe_aln_hi_c;
+    wire acc_fmt_sel = pipe_aln_fmt_sel;
     wire [25+$clog2(TCK+1):0] acc_sig;    //23 mantissa + 1 hidden + 1 sign + log2(N) bits
     wire [TCK-1:0] sigs_sign;    //sign bits of all operands (for int math)
 
     VX_tcu_drl_acc #(
         .N(TCK+1)
     ) csa_acc (
         .sigsIn   (pipe_aln_sigs),
-        .fmt_sel  (pipe_fmt_sel),
+        .fmt_sel  (pipe_aln_fmt_sel),
         .sigOut   (acc_sig),
         .signOuts (sigs_sign)
     );
diff --git a/hw/rtl/tcu/drl/VX_tcu_drl_align.sv b/hw/rtl/tcu/drl/VX_tcu_drl_align.sv
@@ -0,0 +1,39 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+
+module VX_tcu_drl_align #(
+    parameter N = 5    //includes c_val
+) (
+    input wire [N-1:0][7:0] shift_amounts,
+    input wire [N-1:0][24:0] sigs_in,
+    input wire fmt_sel,
+    output logic [N-1:0][24:0] sigs_out
+);
+
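+    //Right-shift each FP significand by its shift amount, then apply the sign (two's complement); when fmt_sel=1 (int) inputs pass through unchanged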
+    for (genvar i = 0; i < N; i++) begin : g_align_signed
+        wire fp_sign = sigs_in[i][24];
+        wire [23:0] fp_sig = sigs_in[i][23:0];
+        wire [23:0] adj_sig = fp_sig >> shift_amounts[i];
+        wire [24:0] fp_val = fp_sign ? -adj_sig : {1'b0, adj_sig};
+        assign sigs_out[i] = fmt_sel ? sigs_in[i] : fp_val;
+    end
+
+endmodule
+
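+/* Disabled alternative kept for reference only; shift_amount / full_sig are not signals of this module (nearest equivalents: shift_amounts[i], fp_sig)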
+        wire [23:0] adj_sig = shift_amount[3] ? 24'd0 : full_sig[i] >> shift_amount;      //reducing switching activity (power) by clamping to 0 if
+                                                                                        //input won't make a significant impact on accumulated value
+*/
diff --git a/hw/rtl/tcu/drl/VX_tcu_drl_mul_exp.sv b/hw/rtl/tcu/drl/VX_tcu_drl_mul_exp.sv
@@ -22,7 +22,8 @@ module VX_tcu_drl_mul_exp #(
     input wire [N-2:0][15:0] b_cols,
     input wire [31:0] c_val,
     output logic [7:0] raw_max_exp,
-    output logic [N-1:0][24:0] sigs_out
+    output logic [N-1:0][7:0] shift_amounts,
+    output logic [N-1:0][24:0] raw_sigs
 );
 
     //raw fp signals
@@ -155,20 +156,16 @@ module VX_tcu_drl_mul_exp #(
     assign int_vals_mux[N-1] = c_val[24:0];
 
     //Raw maximum exponent finder (in parallel to mul) and shift amounts
-    wire [N-1:0][7:0] shift_amounts;
     VX_tcu_drl_max_exp #(
         .N(N)
     ) find_max_exp (
-        .exponents (mul_exp_mux),
-        .max_exp   (raw_max_exp),
+        .exponents     (mul_exp_mux),
+        .max_exp       (raw_max_exp),
         .shift_amounts (shift_amounts)
     );
-
-    //Aligned + signed significands
-    for (genvar i = 0; i < N; i++) begin : g_align_signed
-        wire [23:0] adj_sig = mul_sig_mux[i] >> shift_amounts[i];
-        wire [24:0] fp_val = mul_sign_mux[i] ? -adj_sig : {1'b0, adj_sig};
-        assign sigs_out[i] = (fmt_s[3]) ? int_vals_mux[i] : fp_val;
+    
+    for (genvar i = 0; i < N; i++) begin : g_fp_int_sig_sel
+        assign raw_sigs[i] = fmt_s[3] ? int_vals_mux[i] : {mul_sign_mux[i], mul_sig_mux[i]};
     end
 
 endmodule
diff --git a/hw/rtl/tcu/drl/archive/VX_tcu_drl_align.sv b/hw/rtl/tcu/drl/archive/VX_tcu_drl_align.sv