diff --git a/hw/ip/snitch/src/riscv_instr.sv b/hw/ip/snitch/src/riscv_instr.sv index 089181a1..16eda666 100644 --- a/hw/ip/snitch/src/riscv_instr.sv +++ b/hw/ip/snitch/src/riscv_instr.sv @@ -1338,4 +1338,5 @@ package riscv_instr; localparam logic [11:0] CSR_MHPMCOUNTER29H = 12'hb9d; localparam logic [11:0] CSR_MHPMCOUNTER30H = 12'hb9e; localparam logic [11:0] CSR_MHPMCOUNTER31H = 12'hb9f; + endpackage diff --git a/hw/ip/snitch/src/snitch.sv b/hw/ip/snitch/src/snitch.sv index 5e2d1a93..ecea6bd0 100644 --- a/hw/ip/snitch/src/snitch.sv +++ b/hw/ip/snitch/src/snitch.sv @@ -2335,6 +2335,16 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( riscv_instr::VREDMINU_VS, riscv_instr::VREDMAX_VS, riscv_instr::VREDMAXU_VS, + // CMY: add VMANDNOT VMAND VMOR VMXOR VMORNOT VMNAND VMNOR VMXNOR, 8 masking instructions + riscv_instr::VMANDN_MM, + riscv_instr::VMAND_MM, + riscv_instr::VMOR_MM, + riscv_instr::VMXOR_MM, + riscv_instr::VMORN_MM, + riscv_instr::VMNAND_MM, + riscv_instr::VMNOR_MM, + riscv_instr::VMXNOR_MM, + //---------------------------------------------------------- riscv_instr::VMSEQ_VV, riscv_instr::VMSEQ_VI, riscv_instr::VMSNE_VV, diff --git a/hw/ip/spatz/src/spatz.sv b/hw/ip/spatz/src/spatz.sv index cfe81ea2..dedc606b 100644 --- a/hw/ip/spatz/src/spatz.sv +++ b/hw/ip/spatz/src/spatz.sv @@ -289,7 +289,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .vfu_rsp_o (vfu_rsp ), // VRF .vrf_waddr_o (vrf_waddr[VFU_VD_WD] ), - .vrf_wdata_o (vrf_wdata[VFU_VD_WD] ), + .vrf_wdata_o (vrf_wdata[VFU_VD_WD] ), // N_FU*ELEN bits .vrf_we_o (sb_we[VFU_VD_WD] ), .vrf_wbe_o (vrf_wbe[VFU_VD_WD] ), .vrf_wvalid_i (vrf_wvalid[VFU_VD_WD] ), diff --git a/hw/ip/spatz/src/spatz_controller.sv b/hw/ip/spatz/src/spatz_controller.sv index 2e8837e0..e6ed0a36 100644 --- a/hw/ip/spatz/src/spatz_controller.sv +++ b/hw/ip/spatz/src/spatz_controller.sv @@ -199,7 +199,7 @@ module spatz_controller logic req_buffer_ready, req_buffer_valid, req_buffer_pop; // One element wide instruction buffer - fall_through_register #( + fall_through_register #( // a fifo. .T(spatz_req_t) ) i_req_buffer ( .clk_i (clk_i ), @@ -339,6 +339,12 @@ module spatz_controller scoreboard_d[spatz_req.id].deps[write_table_d[spatz_req.vd].id] |= write_table_d[spatz_req.vd].valid; read_table_d[spatz_req.vd] = {spatz_req.id, 1'b1}; end + // CMY: tackling v0 RAW hazard------------------------------------------------------- + if (!spatz_req.op_arith.vm) begin + scoreboard_d[spatz_req.id].deps[write_table_d[0].id] |= write_table_d[0].valid; + read_table_d[0] = {spatz_req.id, 1'b1}; + end + //-------------------------------------------------------------------------------------- // WAW and WAR hazards if (spatz_req.use_vd) begin @@ -454,7 +460,7 @@ module spatz_controller running_insn_d = running_insn_q; // New instruction! 
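// running_insn_q is a per-ID bitmask of the instructions still in flight, and
// next_insn_id is the free slot granted to the incoming request. With the v0
// hazard fix above, every masked request (vm == 0) records a dependency on the
// pending writer of v0 (RAW) and enters the read table for v0, so a later
// writer of v0 will in turn wait for it (WAR).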
-    if (spatz_req_valid && spatz_req.ex_unit != CON)
+    if (spatz_req_valid && spatz_req.ex_unit != CON) // register the new instruction as running
       running_insn_d[next_insn_id] = 1'b1;

     // Finished an instruction
diff --git a/hw/ip/spatz/src/spatz_decoder.sv b/hw/ip/spatz/src/spatz_decoder.sv
index a41c5436..955dc18e 100644
--- a/hw/ip/spatz/src/spatz_decoder.sv
+++ b/hw/ip/spatz/src/spatz_decoder.sv
@@ -274,6 +274,16 @@ module spatz_decoder
         riscv_instr::VREDMINU_VS,
         riscv_instr::VREDMAX_VS,
         riscv_instr::VREDMAXU_VS,
+// CMY: add VMANDNOT VMAND VMOR VMXOR VMORNOT VMNAND VMNOR VMXNOR, 8 masking instructions
+        riscv_instr::VMANDN_MM,
+        riscv_instr::VMAND_MM,
+        riscv_instr::VMOR_MM,
+        riscv_instr::VMXOR_MM,
+        riscv_instr::VMORN_MM,
+        riscv_instr::VMNAND_MM,
+        riscv_instr::VMNOR_MM,
+        riscv_instr::VMXNOR_MM,
+//-------------------------------------------------------------
         riscv_instr::VMSEQ_VV,
         riscv_instr::VMSEQ_VX,
         riscv_instr::VMSEQ_VI,
@@ -348,7 +358,7 @@ module spatz_decoder
         automatic vreg_t arith_s1 = decoder_req_i.instr[19:15];
         automatic vreg_t arith_s2 = decoder_req_i.instr[24:20];
         automatic vreg_t arith_d  = decoder_req_i.instr[11:7];
-        automatic logic  arith_vm = decoder_req_i.instr[25];
+        automatic logic  arith_vm = decoder_req_i.instr[25]; // vector arithmetic mask enable (vm) bit; vm = 0 means masked by v0.t

         spatz_req.op_arith.vm = arith_vm;
         spatz_req.op_sld.vm   = arith_vm;
@@ -827,6 +837,39 @@ module spatz_decoder
             end
           end

+          // CMY: Mask operations
+          riscv_instr::VMANDN_MM: begin
+            spatz_req.op = VMANDNOT;
+          end
+
+          riscv_instr::VMAND_MM: begin
+            spatz_req.op = VMAND;
+          end
+
+          riscv_instr::VMOR_MM: begin
+            spatz_req.op = VMOR;
+          end
+
+          riscv_instr::VMXOR_MM: begin
+            spatz_req.op = VMXOR;
+          end
+
+          riscv_instr::VMORN_MM: begin
+            spatz_req.op = VMORNOT;
+          end
+
+          riscv_instr::VMNAND_MM: begin
+            spatz_req.op = VMNAND;
+          end
+
+          riscv_instr::VMNOR_MM: begin
+            spatz_req.op = VMNOR;
+          end
+
+          riscv_instr::VMXNOR_MM: begin
+            spatz_req.op = VMXNOR;
+          end
+
           default: illegal_instr = 1'b1;
         endcase // Arithmetic Instruction Type
       end
diff --git a/hw/ip/spatz/src/spatz_ipu.sv b/hw/ip/spatz/src/spatz_ipu.sv
index 9d6f6485..2e4c1253 100644
--- a/hw/ip/spatz/src/spatz_ipu.sv
+++ b/hw/ip/spatz/src/spatz_ipu.sv
@@ -478,6 +478,7 @@ module spatz_ipu import spatz_pkg::*; import rvv_pkg::vew_e; #(
   ///////////////

   // Collect results from the SIMD lanes
+  // Each lane is responsible for calculating one element.
   always_comb begin : collector
     unique case (sew)
       rvv_pkg::EW_8 : begin
diff --git a/hw/ip/spatz/src/spatz_pkg.sv.tpl b/hw/ip/spatz/src/spatz_pkg.sv.tpl
index 7d565323..c3c1cd8c 100644
--- a/hw/ip/spatz/src/spatz_pkg.sv.tpl
+++ b/hw/ip/spatz/src/spatz_pkg.sv.tpl
@@ -108,6 +108,10 @@ package spatz_pkg;
   typedef logic [$clog2(NrVRFWords)-1:0] vrf_addr_t;
   typedef logic [N_FU*ELENB-1:0]         vrf_be_t;
   typedef logic [N_FU*ELEN-1:0]          vrf_data_t;
+  // ELEN = 64
+  // The VRF is centralized and serves all functional units.
+  // Each VRF port is 64F-bit wide. F denotes the number of FPUs.
+  // the FU here doesn't refer to Functional Units:
N_FU=max{N_IPU,N_FPU} // Instruction ID typedef logic [$clog2(NrParallelInstructions)-1:0] spatz_id_t; diff --git a/hw/ip/spatz/src/spatz_simd_lane.sv b/hw/ip/spatz/src/spatz_simd_lane.sv index bc5c684a..2fa4bb9b 100644 --- a/hw/ip/spatz/src/spatz_simd_lane.sv +++ b/hw/ip/spatz/src/spatz_simd_lane.sv @@ -220,9 +220,14 @@ module spatz_simd_lane import spatz_pkg::*; import rvv_pkg::vew_e; #( VSUB, VRSUB, VNMSAC, VNMSUB, VSBC: simd_result = subtractor_result[Width-1:0]; VMIN, VMINU : simd_result = $signed({op_s1_i[Width-1] & is_signed_i, op_s1_i}) <= $signed({op_s2_i[Width-1] & is_signed_i, op_s2_i}) ? op_s1_i : op_s2_i; VMAX, VMAXU : simd_result = $signed({op_s1_i[Width-1] & is_signed_i, op_s1_i}) > $signed({op_s2_i[Width-1] & is_signed_i, op_s2_i}) ? op_s1_i : op_s2_i; - VAND : simd_result = op_s1_i & op_s2_i; - VOR : simd_result = op_s1_i | op_s2_i; - VXOR : simd_result = op_s1_i ^ op_s2_i; + VAND, VMAND : simd_result = op_s1_i & op_s2_i; // CMY: add masking support + VOR , VMOR : simd_result = op_s1_i | op_s2_i; // like above + VXOR, VMXOR : simd_result = op_s1_i ^ op_s2_i; // like above + VMANDNOT : simd_result = ~op_s1_i & op_s2_i; // like above + VMORNOT : simd_result = ~op_s1_i | op_s2_i; // like above + VMNAND : simd_result = ~(op_s1_i & op_s2_i); // like above + VMNOR : simd_result = ~(op_s1_i | op_s2_i); // like above + VMXNOR : simd_result = ~(op_s1_i ^ op_s2_i); // like above VSLL : simd_result = shift_operand << shift_amount; VSRL : simd_result = shift_operand >> shift_amount; VSRA : simd_result = $signed(shift_operand) >>> shift_amount; diff --git a/hw/ip/spatz/src/spatz_vfu.sv b/hw/ip/spatz/src/spatz_vfu.sv index 01a73a56..1847e61d 100644 --- a/hw/ip/spatz/src/spatz_vfu.sv +++ b/hw/ip/spatz/src/spatz_vfu.sv @@ -125,12 +125,20 @@ module spatz_vfu // Do we have the reduction operand? logic reduction_operand_ready_d, reduction_operand_ready_q; +// CMY: Are we reading operands or v0.t? + typedef enum logic{ + READ_OPERANDS, READ_V0_t + } operand_state_t; + operand_state_t operand_state_d, operand_state_q; + `FF(operand_state_q, operand_state_d, READ_OPERANDS) + // Are the VFU operands ready? logic op1_is_ready, op2_is_ready, op3_is_ready, operands_ready; - assign op1_is_ready = spatz_req_valid && ((!spatz_req.op_arith.is_reduction && (!spatz_req.use_vs1 || vrf_rvalid_i[1])) || (spatz_req.op_arith.is_reduction && reduction_operand_ready_q)); - assign op2_is_ready = spatz_req_valid && ((!spatz_req.use_vs2 || vrf_rvalid_i[0]) || spatz_req.op_arith.is_reduction); - assign op3_is_ready = spatz_req_valid && (!spatz_req.vd_is_src || vrf_rvalid_i[2]); + assign op1_is_ready = spatz_req_valid && (operand_state_q == READ_OPERANDS) && ((!spatz_req.op_arith.is_reduction && (!spatz_req.use_vs1 || vrf_rvalid_i[1])) || (spatz_req.op_arith.is_reduction && reduction_operand_ready_q)); + assign op2_is_ready = spatz_req_valid && (operand_state_q == READ_OPERANDS) && ((!spatz_req.use_vs2 || vrf_rvalid_i[0]) || spatz_req.op_arith.is_reduction); + assign op3_is_ready = spatz_req_valid && (operand_state_q == READ_OPERANDS) && (!spatz_req.vd_is_src || vrf_rvalid_i[2]); assign operands_ready = op1_is_ready && op2_is_ready && op3_is_ready && (!spatz_req.op_arith.is_scalar || vfu_rsp_ready_i) && !stall; +// CMY: added (operand_state_q == READ_OPERANDS). 
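The added `(operand_state_q == READ_OPERANDS)` term is the crux of the VFU change: while the FSM is in `READ_V0_t`, all three operand-ready flags are forced low, so normal operand streaming stalls while the VRF read ports are borrowed to fetch the v0.t mask word. Below is a minimal, self-contained sketch of this two-state pattern. It is an illustration only; the port names (`req_is_masked_i`, `insn_retired_i`, ...) are invented for the sketch and are not the actual Spatz VFU interface.

module mask_fetch_fsm (
  input  logic clk_i,
  input  logic rst_ni,
  input  logic req_valid_i,     // a vector request is pending
  input  logic req_is_masked_i, // the request has vm == 0, i.e. uses v0.t
  input  logic vrf_rvalid_i,    // the VRF answers the v0 read this cycle
  input  logic insn_retired_i,  // the instruction has left the unit
  output logic reading_mask_o   // stall normal operand reads while high
);
  typedef enum logic { READ_OPERANDS, READ_V0 } state_e;
  state_e state_d, state_q;
  logic   mask_read_done_d, mask_read_done_q;

  always_comb begin
    state_d          = state_q;
    mask_read_done_d = mask_read_done_q;
    unique case (state_q)
      // Divert to the mask fetch once per masked instruction
      READ_OPERANDS: if (req_valid_i && req_is_masked_i && !mask_read_done_q)
        state_d = READ_V0;
      // Return as soon as the VRF delivers the mask word
      READ_V0: if (vrf_rvalid_i) begin
        state_d          = READ_OPERANDS;
        mask_read_done_d = 1'b1; // do not fetch v0 again for this instruction
      end
    endcase
    if (insn_retired_i)
      mask_read_done_d = 1'b0;   // re-arm for the next masked instruction
  end

  always_ff @(posedge clk_i or negedge rst_ni)
    if (!rst_ni) begin
      state_q          <= READ_OPERANDS;
      mask_read_done_q <= 1'b0;
    end else begin
      state_q          <= state_d;
      mask_read_done_q <= mask_read_done_d;
    end

  assign reading_mask_o = (state_q == READ_V0);
endmodule

In the actual code the "mask word delivered" condition checks both read ports (`vrf_rvalid_i[0] && vrf_rvalid_i[1]`), because the VLEN mask bits of v0 span two VRF words that are fetched through ports 0 and 1 together.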
// Valid operations logic [N_FU*ELENB-1:0] valid_operations; @@ -150,6 +158,7 @@ module spatz_vfu // Is this a FPU instruction logic is_fpu_insn; assign is_fpu_insn = FPU && spatz_req.op inside {[VFADD:VSDOTP]}; + // FPU is defined in spart_pkg , localparam bit FPU = N_FPU != 0; // Is the FPU busy? logic is_fpu_busy; @@ -167,6 +176,7 @@ module spatz_vfu typedef enum logic [2:0] { Reduction_NormalExecution, Reduction_Wait, + Reduction_Read_V0_t, // CMY added a state Reduction_Init, Reduction_Reduce, Reduction_WriteBack @@ -217,7 +227,7 @@ module spatz_vfu vl_d = vl_q + nr_elem_word; // Update narrowing information narrowing_upper_d = narrowing_upper_q ^ spatz_req.op_arith.is_narrowing; - widening_upper_d = widening_upper_q ^ (spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2); + widening_upper_d = widening_upper_q ^ (spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2); // toggle the signal if requires widening end // Current state of the VFU @@ -226,7 +236,7 @@ module spatz_vfu VFU_RunningIPU: begin // Only go to the FPU state once the IPUs are no longer busy if (is_fpu_insn) begin - if (is_ipu_busy) + if (is_ipu_busy) // CMY: why should we waid for ipu idle when it is a fpu_insn? stall = 1'b1; else begin state_d = VFU_RunningFPU; @@ -283,9 +293,126 @@ module spatz_vfu // Operands // ////////////// +//CMY: put the fpu_decoder in front of the reduction_useless_value selection. +// because the value selection depends on the FPU source format. + operation_e fpu_op; + fp_format_e fpu_src_fmt, fpu_dst_fmt; + int_format_e fpu_int_fmt; + logic fpu_op_mode; + logic fpu_vectorial_op; + + logic [N_FPU-1:0] fpu_busy_d, fpu_busy_q; + `FF(fpu_busy_q, fpu_busy_d, '0) + + status_t [N_FPU-1:0] fpu_status_d, fpu_status_q; + `FF(fpu_status_q, fpu_status_d, '0) + + always_comb begin: gen_decoder + fpu_op = fpnew_pkg::FMADD; + fpu_op_mode = 1'b0; + fpu_vectorial_op = 1'b0; + is_fpu_busy = |fpu_busy_q; + fpu_src_fmt = fpnew_pkg::FP32; + fpu_dst_fmt = fpnew_pkg::FP32; + fpu_int_fmt = fpnew_pkg::INT32; + + fpu_status_o = '0; + for (int fpu = 0; fpu < N_FPU; fpu++) + fpu_status_o |= fpu_status_q[fpu]; + + if (FPU) begin + unique case (spatz_req.vtype.vsew) + EW_64: begin + if (RVD) begin + fpu_src_fmt = fpnew_pkg::FP64; + fpu_dst_fmt = fpnew_pkg::FP64; + fpu_int_fmt = fpnew_pkg::INT64; + end + end + EW_32: begin + fpu_src_fmt = spatz_req.op_arith.is_narrowing || spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 ? fpnew_pkg::FP64 : fpnew_pkg::FP32; + fpu_dst_fmt = spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 || spatz_req.op == VSDOTP ? fpnew_pkg::FP64 : fpnew_pkg::FP32; + fpu_int_fmt = spatz_req.op_arith.is_narrowing && spatz_req.op inside {VI2F, VU2F} ? fpnew_pkg::INT64 : fpnew_pkg::INT32; + fpu_vectorial_op = FLEN > 32; + end + EW_16: begin + fpu_src_fmt = spatz_req.op_arith.is_narrowing || spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 ? fpnew_pkg::FP32 : (spatz_req.fm.src ? fpnew_pkg::FP16ALT : fpnew_pkg::FP16); + fpu_dst_fmt = spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 || spatz_req.op == VSDOTP ? fpnew_pkg::FP32 : (spatz_req.fm.dst ? fpnew_pkg::FP16ALT : fpnew_pkg::FP16); + fpu_int_fmt = spatz_req.op_arith.is_narrowing && spatz_req.op inside {VI2F, VU2F} ? fpnew_pkg::INT32 : fpnew_pkg::INT16; + fpu_vectorial_op = 1'b1; + end + EW_8: begin + fpu_src_fmt = spatz_req.op_arith.is_narrowing || spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 ? (spatz_req.fm.src ? fpnew_pkg::FP16ALT : fpnew_pkg::FP16) : (spatz_req.fm.src ? 
fpnew_pkg::FP8ALT : fpnew_pkg::FP8); + fpu_dst_fmt = spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2 || spatz_req.op == VSDOTP ? (spatz_req.fm.dst ? fpnew_pkg::FP16ALT : fpnew_pkg::FP16) : (spatz_req.fm.dst ? fpnew_pkg::FP8ALT : fpnew_pkg::FP8); + fpu_int_fmt = spatz_req.op_arith.is_narrowing && spatz_req.op inside {VI2F, VU2F} ? fpnew_pkg::INT16 : fpnew_pkg::INT8; + fpu_vectorial_op = 1'b1; + end + default:; + endcase + + unique case (spatz_req.op) + VFADD: fpu_op = fpnew_pkg::ADD; + VFSUB: begin + fpu_op = fpnew_pkg::ADD; + fpu_op_mode = 1'b1; + end + VFMUL : fpu_op = fpnew_pkg::MUL; + VFMADD : fpu_op = fpnew_pkg::FMADD; + VFMSUB : begin + fpu_op = fpnew_pkg::FMADD; + fpu_op_mode = 1'b1; + end + VFNMSUB: fpu_op = fpnew_pkg::FNMSUB; + VFNMADD: begin + fpu_op = fpnew_pkg::FNMSUB; + fpu_op_mode = 1'b1; + end + + VFMINMAX: begin + fpu_op = fpnew_pkg::MINMAX; + fpu_dst_fmt = fpu_src_fmt; + end + + + VFSGNJ : begin + fpu_op = fpnew_pkg::SGNJ; + fpu_dst_fmt = fpu_src_fmt; + end + VFCLASS: begin + fpu_op = fpnew_pkg::CLASSIFY; + fpu_dst_fmt = fpu_src_fmt; + end + VFCMP : begin + fpu_op = fpnew_pkg::CMP; + fpu_dst_fmt = fpu_src_fmt; + end + + VF2F: fpu_op = fpnew_pkg::F2F; + VF2I: fpu_op = fpnew_pkg::F2I; + VF2U: begin + fpu_op = fpnew_pkg::F2I; + fpu_op_mode = 1'b1; + end + VI2F: fpu_op = fpnew_pkg::I2F; + VU2F: begin + fpu_op = fpnew_pkg::I2F; + fpu_op_mode = 1'b1; + end + + VSDOTP: fpu_op = fpnew_pkg::SDOTP; + + default:; + endcase + end + end: gen_decoder +//--------------------------------------------------- + + + // Reduction registers elen_t [1:0] reduction_q, reduction_d; `FFL(reduction_q, reduction_d, reduction_operand_ready_d, '0) + elen_t reduction_useless_value; // IPU results logic [N_FU*ELEN-1:0] ipu_result; @@ -299,34 +426,101 @@ module spatz_vfu // Operands and result signals logic [N_FU*ELEN-1:0] operand1, operand2, operand3; + logic [N_FU*ELEN-1:0] operand_v0_t_lo,operand_v0_t_lo_q; // CMY: v0 should be read from vrf + logic [N_FU*ELEN-1:0] operand_v0_t_hi,operand_v0_t_hi_q; logic [N_FU*ELENB-1:0] in_ready; - always_comb begin: operand_proc - if (spatz_req.op_arith.is_scalar) - operand1 = {1*N_FU{spatz_req.rs1}}; - else if (spatz_req.use_vs1) - operand1 = spatz_req.op_arith.is_reduction ? $unsigned(reduction_q[1]) : vrf_rdata_i[1]; - else begin - // Replicate scalar operands - unique case (spatz_req.op == VSDOTP ? vew_e'(spatz_req.vtype.vsew + 1) : spatz_req.vtype.vsew) - EW_8 : operand1 = MAXEW == EW_32 ? {4*N_FU{spatz_req.rs1[7:0]}} : {8*N_FU{spatz_req.rs1[7:0]}}; - EW_16: operand1 = MAXEW == EW_32 ? {2*N_FU{spatz_req.rs1[15:0]}} : {4*N_FU{spatz_req.rs1[15:0]}}; - EW_32: operand1 = MAXEW == EW_32 ? {1*N_FU{spatz_req.rs1[31:0]}} : {2*N_FU{spatz_req.rs1[31:0]}}; - default: operand1 = {1*N_FU{spatz_req.rs1}}; - endcase - end - if ((!spatz_req.op_arith.is_scalar || spatz_req.op == VADD) && spatz_req.use_vs2) - operand2 = spatz_req.op_arith.is_reduction ? $unsigned(reduction_q[0]) : vrf_rdata_i[0]; - else - // Replicate scalar operands - unique case (spatz_req.op == VSDOTP ? vew_e'(spatz_req.vtype.vsew + 1) : spatz_req.vtype.vsew) - EW_8 : operand2 = MAXEW == EW_32 ? {4*N_FU{spatz_req.rs2[7:0]}} : {8*N_FU{spatz_req.rs2[7:0]}}; - EW_16: operand2 = MAXEW == EW_32 ? {2*N_FU{spatz_req.rs2[15:0]}} : {4*N_FU{spatz_req.rs2[15:0]}}; - EW_32: operand2 = MAXEW == EW_32 ? {1*N_FU{spatz_req.rs2[31:0]}} : {2*N_FU{spatz_req.rs2[31:0]}}; - default: operand2 = {1*N_FU{spatz_req.rs2}}; + //CMY: have we fetched the v0.t in reduction masking instructions. 
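The "read done" flags declared below are built with `FFLARNC from common_cells' registers.svh: a load-enable flip-flop with asynchronous reset and synchronous clear. As instantiated here, `FFLARNC(done, 1'b1, ready, rsp_valid, 1'b0, clk_i, rst_ni) behaves roughly like the sticky flag sketched next (a sketch under the assumption, as in the upstream macro, that clear takes priority over load):

module sticky_done_flag (
  input  logic clk_i,
  input  logic rst_ni,
  input  logic ready_i,     // set: the v0.t word was captured this cycle
  input  logic rsp_valid_i, // clear: the instruction retired
  output logic done_o
);
  always_ff @(posedge clk_i or negedge rst_ni)
    if (!rst_ni)          done_o <= 1'b0; // asynchronous reset
    else if (rsp_valid_i) done_o <= 1'b0; // synchronous clear wins
    else if (ready_i)     done_o <= 1'b1; // set and hold until cleared
endmodule

This flag is what lets the reduction path read v0.t exactly once per instruction before proceeding with the normal reduction sequencing.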
+ logic reduction_v0_t_is_ready; + assign reduction_v0_t_is_ready = (reduction_state_q == Reduction_Read_V0_t) && vrf_rvalid_i[0] && vrf_rvalid_i[1]; + logic reduction_v0_t_read_done; + `FFLARNC(reduction_v0_t_read_done,1'b1,reduction_v0_t_is_ready,vfu_rsp_valid_o,1'b0,clk_i,rst_ni); + //---------------------------------------------------------------- + + // CMY: back up v0.t for reduction instructions.----------------------- + logic [N_FU*ELEN-1:0] reduction_operand_v0_t_lo,reduction_operand_v0_t_lo_q; + logic [N_FU*ELEN-1:0] reduction_operand_v0_t_hi,reduction_operand_v0_t_hi_q; + `FFL(reduction_operand_v0_t_lo_q, reduction_operand_v0_t_lo, reduction_v0_t_is_ready, '0) + `FFL(reduction_operand_v0_t_hi_q, reduction_operand_v0_t_hi, reduction_v0_t_is_ready, '0) + logic [VLEN-1:0] reduction_operand_v0_t_q; + assign reduction_operand_v0_t_q = {reduction_operand_v0_t_hi_q, reduction_operand_v0_t_lo_q}; + //--------------------------------------------------------------------------------- + + // CMY:an FSM to manage operands between normal calculation and v0.t fetching----------------- + + logic v0_t_is_ready; + assign v0_t_is_ready = (operand_state_q == READ_V0_t) && vrf_rvalid_i[0] && vrf_rvalid_i[1]; + logic v0_t_read_done; + `FFLARNC(v0_t_read_done,1'b1,v0_t_is_ready,vfu_rsp_valid_o,1'b0,clk_i,rst_ni); + + always_comb begin: operand_selection + operand_state_d = operand_state_q; + // if(spatz_req_valid) begin + unique case(operand_state_q) + READ_V0_t: + if(v0_t_is_ready) operand_state_d = READ_OPERANDS; + else operand_state_d = operand_state_q; + READ_OPERANDS: + if(spatz_req_valid && !spatz_req.op_arith.is_scalar && !spatz_req.op_arith.vm && !v0_t_read_done && !spatz_req.op_arith.is_reduction) + operand_state_d = READ_V0_t; + else operand_state_d = READ_OPERANDS; + default: operand_state_d = operand_state_q; endcase + // end + end:operand_selection + + vlen_t vl_q_plus_nr_elem_word; + assign vl_q_plus_nr_elem_word = vl_q + nr_elem_word; // CMY: for monitoring. + + //-------------------------------------------- + + always_comb begin: operand_proc // CMY: turn it into a FSM + reduction_operand_v0_t_lo = '0; + reduction_operand_v0_t_hi = '0; + operand_v0_t_lo = '0; + operand_v0_t_hi = '0; + operand1 = '0; + operand2 = '0; + case (operand_state_q) + READ_OPERANDS: begin + if(reduction_state_q == Reduction_Read_V0_t) begin + reduction_operand_v0_t_lo = vrf_rdata_i[0]; + reduction_operand_v0_t_hi = vrf_rdata_i[1]; + end + else begin + if (spatz_req.op_arith.is_scalar) + operand1 = {1*N_FU{spatz_req.rs1}}; + else if (spatz_req.use_vs1) + operand1 = spatz_req.op_arith.is_reduction ? $unsigned(reduction_q[1]) : vrf_rdata_i[1]; + else begin + // Replicate scalar operands + unique case (spatz_req.op == VSDOTP ? vew_e'(spatz_req.vtype.vsew + 1) : spatz_req.vtype.vsew) + EW_8 : operand1 = MAXEW == EW_32 ? {4*N_FU{spatz_req.rs1[7:0]}} : {8*N_FU{spatz_req.rs1[7:0]}}; + EW_16: operand1 = MAXEW == EW_32 ? {2*N_FU{spatz_req.rs1[15:0]}} : {4*N_FU{spatz_req.rs1[15:0]}}; + EW_32: operand1 = MAXEW == EW_32 ? {1*N_FU{spatz_req.rs1[31:0]}} : {2*N_FU{spatz_req.rs1[31:0]}}; + default: operand1 = {1*N_FU{spatz_req.rs1}}; + endcase + end - operand3 = spatz_req.op_arith.is_scalar ? {1*N_FU{spatz_req.rsd}} : vrf_rdata_i[2]; + if ((!spatz_req.op_arith.is_scalar || spatz_req.op == VADD) && spatz_req.use_vs2) + operand2 = spatz_req.op_arith.is_reduction ? $unsigned(reduction_q[0]) : vrf_rdata_i[0]; + else + // Replicate scalar operands + unique case (spatz_req.op == VSDOTP ? 
vew_e'(spatz_req.vtype.vsew + 1) : spatz_req.vtype.vsew) + EW_8 : operand2 = MAXEW == EW_32 ? {4*N_FU{spatz_req.rs2[7:0]}} : {8*N_FU{spatz_req.rs2[7:0]}}; + EW_16: operand2 = MAXEW == EW_32 ? {2*N_FU{spatz_req.rs2[15:0]}} : {4*N_FU{spatz_req.rs2[15:0]}}; + EW_32: operand2 = MAXEW == EW_32 ? {1*N_FU{spatz_req.rs2[31:0]}} : {2*N_FU{spatz_req.rs2[31:0]}}; + default: operand2 = {1*N_FU{spatz_req.rs2}}; + endcase + end + end + READ_V0_t: begin + operand_v0_t_lo = vrf_rdata_i[0]; + operand_v0_t_hi = vrf_rdata_i[1]; + end + default:; + endcase + operand3 = spatz_req.op_arith.is_scalar ? {1*N_FU{spatz_req.rsd}} : vrf_rdata_i[2]; // VFU_VD_RD // operand3 is used in MAC computation, like VMADD end: operand_proc assign in_ready = state_q == VFU_RunningIPU ? ipu_in_ready : fpu_in_ready; @@ -335,6 +529,12 @@ module spatz_vfu assign scalar_result = result[ELEN-1:0]; + `FFL(operand_v0_t_lo_q, operand_v0_t_lo, v0_t_is_ready, '0) // CMY: backup v0.t + `FFL(operand_v0_t_hi_q, operand_v0_t_hi, v0_t_is_ready, '0) + + logic [VLEN-1:0] operand_v0_t_q; + assign operand_v0_t_q = {operand_v0_t_hi_q,operand_v0_t_lo_q}; + /////////////////////// // Reduction logic // /////////////////////// @@ -349,6 +549,77 @@ module spatz_vfu // Do we need to request reduction operands? logic [1:0] reduction_operand_request; + // CMY: reduction_useless_value selection----------------------- + always_comb begin: reduction_useless_value_selection + reduction_useless_value = '0; + if(spatz_req.op_arith.is_reduction == 1'b1) begin + case(spatz_req.op) + VADD: // riscv_instr::VREDSUM_VS,riscv_instr::VFREDUSUM_VS,riscv_instr::VFREDOSUM_VS + reduction_useless_value = '0; + VAND: //riscv_instr::VREDAND_VS: + reduction_useless_value = '1; + VOR, //riscv_instr::VREDOR_VS, + VXOR: //riscv_instr::VREDXOR_VS: + reduction_useless_value = '0; + VMINU: //riscv_instr::VREDMINU_VS: + reduction_useless_value = '1; + VMIN: //riscv_instr::VREDMIN_VS: + unique case(spatz_req.vtype.vsew) + EW_8:reduction_useless_value = {1'b0,7'h7f}; + EW_16:reduction_useless_value = {1'b0,15'h7fff}; + EW_32:reduction_useless_value = {1'b0,31'h7fffffff}; + default: + if(MAXEW == EW_64) reduction_useless_value = {1'b0,63'h7fffffffffffffff}; + endcase + VMAXU: //riscv_instr::VREDMAXU_VS: + reduction_useless_value = '0; + VMAX: //riscv_instr::VREDMAX_VS: //complement code of -infinity + unique case(spatz_req.vtype.vsew) + EW_8:reduction_useless_value = {1'b1,7'h0}; + EW_16:reduction_useless_value = {1'b1,15'h0}; + EW_32:reduction_useless_value = {1'b1,31'h0}; + default: + if(MAXEW == EW_64) reduction_useless_value = {1'b1,63'h0}; + endcase + VFMINMAX: begin + if(spatz_req.rm == fpnew_pkg::RNE) begin //riscv_instr::VFREDMIN_VS: + unique case(fpu_src_fmt) // fpu_src_fmt is synchronous with spatz_req.op, while fpu_src_fmt_q is synchronous with op_q + // + infinity + fpnew_pkg::FP64:reduction_useless_value = {1'b0,11'h7ff,52'h0}; + fpnew_pkg::FP32:reduction_useless_value = {1'b0,8'hff,23'h0}; + fpnew_pkg::FP16:reduction_useless_value = {1'b0,5'h1f,10'h0}; + fpnew_pkg::FP16ALT:reduction_useless_value = {1'b0,8'hff,7'h0}; + fpnew_pkg::FP8:reduction_useless_value = {1'b0,5'h1f,2'h0}; + fpnew_pkg::FP8ALT:reduction_useless_value = {1'b0,4'hf,3'h0}; + endcase + end + if (spatz_req.rm == fpnew_pkg::RTZ) begin //riscv_instr::VFREDMAX_VS: + unique case(fpu_src_fmt) + // - infinity + fpnew_pkg::FP64:reduction_useless_value = {1'b1,11'h7ff,52'h0}; + fpnew_pkg::FP32:reduction_useless_value = {1'b1,8'hff,23'h0}; + fpnew_pkg::FP16:reduction_useless_value = {1'b1,5'h1f,10'h0}; + 
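                // The remaining entries use the same -infinity pattern:
                // sign = 1, exponent all ones, mantissa = 0. FP16ALT is the
                // bfloat16-style format (8-bit exponent, 7-bit mantissa),
                // FP8 is E5M2, and FP8ALT is E4M3, which explains the field
                // widths on the next three lines.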
fpnew_pkg::FP16ALT:reduction_useless_value = {1'b1,8'hff,7'h0}; + fpnew_pkg::FP8:reduction_useless_value = {1'b1,5'h1f,2'h0}; + fpnew_pkg::FP8ALT:reduction_useless_value = {1'b1,4'hf,3'h0}; + endcase + end + end + default: reduction_useless_value='0; + endcase + end + end + + // ----------------------------------------------------------- + + // CMY: add monitor signals for reduction_pointer_q and reduction_opreand_v0_t_q[reduction_pointer_q] + logic [idx_width(N_FU*ELENB)-1:0] reduction_pointer_q_idx_width_N_FU_ELENB_0; + assign reduction_pointer_q_idx_width_N_FU_ELENB_0 = reduction_pointer_q[idx_width(N_FU*ELENB):0]; + + logic v0_t_reduction_pointer_q_idx_width_N_FU_ELENB_0; + assign v0_t_reduction_pointer_q_idx_width_N_FU_ELENB_0 = reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]; + //------------------------------------------------------------------------------ + always_comb begin: proc_reduction // Maintain state reduction_state_d = reduction_state_q; @@ -372,7 +643,7 @@ module spatz_vfu reduction_operand_request[1] = (reduction_state_q inside {Reduction_Init, Reduction_Reduce}) || !spatz_req.op_arith.is_reduction; unique case (reduction_state_q) - Reduction_NormalExecution: begin + Reduction_NormalExecution: begin // not a reduction instruction // Did we issue a word to the FUs? word_issued = spatz_req_valid && &(in_ready | ~valid_operations) && operands_ready && !stall; @@ -384,7 +655,7 @@ module spatz_vfu // Do we have a new reduction instruction? if (spatz_req_valid && !running_q[spatz_req.id] && spatz_req.op_arith.is_reduction) - reduction_state_d = is_fpu_busy ? Reduction_Wait : Reduction_Init; + reduction_state_d = (!spatz_req.op_arith.vm) ? Reduction_Read_V0_t : is_fpu_busy ? Reduction_Wait : Reduction_Init; // CMY: added Reduction_Read_V0_t state end Reduction_Wait: begin @@ -395,21 +666,38 @@ module spatz_vfu reduction_state_d = Reduction_Init; end + Reduction_Read_V0_t:begin + if(reduction_v0_t_is_ready) + if (!is_fpu_busy) + reduction_state_d = Reduction_Init; + else reduction_state_d = Reduction_Wait; + else reduction_state_d = Reduction_Read_V0_t; + end + Reduction_Init: begin // Initialize the reduction // verilator lint_off SELRANGE unique case (spatz_req.vtype.vsew) EW_8 : begin reduction_d[0] = $unsigned(vrf_rdata_i[0][7:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][8*reduction_pointer_q[idx_width(N_FU*ELENB)-1:0] +: 8]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][8*reduction_pointer_q[idx_width(N_FU*ELENB)-1:0] +: 8]); end EW_16: begin reduction_d[0] = $unsigned(vrf_rdata_i[0][15:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][16*reduction_pointer_q[idx_width(N_FU*ELENB)-2:0] +: 16]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][16*reduction_pointer_q[idx_width(N_FU*ELENB)-2:0] +: 16]); end EW_32: begin reduction_d[0] = $unsigned(vrf_rdata_i[0][31:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][32*reduction_pointer_q[idx_width(N_FU*ELENB)-3:0] +: 32]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][32*reduction_pointer_q[idx_width(N_FU*ELENB)-3:0] +: 32]); end default: begin `ifdef 
MEMPOOL_SPATZ @@ -417,7 +705,10 @@ module spatz_vfu `else if (MAXEW == EW_64) begin reduction_d[0] = $unsigned(vrf_rdata_i[0][63:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][64*reduction_pointer_q[idx_width(N_FU*ELENB)-4:0] +: 64]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][64*reduction_pointer_q[idx_width(N_FU*ELENB)-4:0] +: 64]); end `endif end @@ -444,15 +735,24 @@ module spatz_vfu unique case (spatz_req.vtype.vsew) EW_8 : begin reduction_d[0] = $unsigned(result[7:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][8*reduction_pointer_q[idx_width(N_FU*ELENB)-1:0] +: 8]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][8*reduction_pointer_q[idx_width(N_FU*ELENB)-1:0] +: 8]); end EW_16: begin reduction_d[0] = $unsigned(result[15:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][16*reduction_pointer_q[idx_width(N_FU*ELENB)-2:0] +: 16]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][16*reduction_pointer_q[idx_width(N_FU*ELENB)-2:0] +: 16]); end EW_32: begin reduction_d[0] = $unsigned(result[31:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][32*reduction_pointer_q[idx_width(N_FU*ELENB)-3:0] +: 32]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][32*reduction_pointer_q[idx_width(N_FU*ELENB)-3:0] +: 32]); end default: begin `ifdef MEMPOOL_SPATZ @@ -460,7 +760,10 @@ module spatz_vfu `else if (MAXEW == EW_64) begin reduction_d[0] = $unsigned(result[63:0]); - reduction_d[1] = $unsigned(vrf_rdata_i[1][64*reduction_pointer_q[idx_width(N_FU*ELENB)-4:0] +: 64]); + if(!spatz_req.op_arith.vm && !reduction_operand_v0_t_q[reduction_pointer_q[idx_width(N_FU*ELENB):0]]) + reduction_d[1] = reduction_useless_value; + else + reduction_d[1] = $unsigned(vrf_rdata_i[1][64*reduction_pointer_q[idx_width(N_FU*ELENB)-4:0] +: 64]); end `endif end @@ -546,52 +849,135 @@ module spatz_vfu reduction : spatz_req.op_arith.is_reduction }; - if (spatz_req_valid && vl_q == '0) begin - vreg_addr_d[0] = (spatz_req.vs2 + vstart) << $clog2(NrWordsPerVector); - vreg_addr_d[1] = (spatz_req.vs1 + vstart) << $clog2(NrWordsPerVector); - vreg_addr_d[2] = (spatz_req.vd + vstart) << $clog2(NrWordsPerVector); - - // Direct feedthrough - vrf_raddr_o = vreg_addr_d; - if (!spatz_req.op_arith.is_scalar) - input_tag.vd_addr = vreg_addr_d[2]; - - // Did we commit a word already? 
- if (word_issued) begin - vreg_addr_d[0] = vreg_addr_d[0] + (!spatz_req.op_arith.widen_vs2 || widening_upper_q); - vreg_addr_d[1] = vreg_addr_d[1] + (!spatz_req.op_arith.widen_vs1 || widening_upper_q); - vreg_addr_d[2] = vreg_addr_d[2] + (!spatz_req.op_arith.is_reduction && (!spatz_req.op_arith.is_narrowing || narrowing_upper_q)); - end - end else if (spatz_req_valid && vl_q < spatz_req.vl && word_issued) begin - vreg_addr_d[0] = vreg_addr_q[0] + (!spatz_req.op_arith.widen_vs2 || widening_upper_q); - vreg_addr_d[1] = vreg_addr_q[1] + (!spatz_req.op_arith.widen_vs1 || widening_upper_q); - vreg_addr_d[2] = vreg_addr_q[2] + (!spatz_req.op_arith.is_reduction && (!spatz_req.op_arith.is_narrowing || narrowing_upper_q)); - end + case(operand_state_q)// CMY modified + READ_OPERANDS:begin + if(reduction_state_q == Reduction_Read_V0_t) begin + vreg_addr_d[0] = 0 << $clog2(NrWordsPerVector); + vreg_addr_d[1] = 1 << $clog2(NrWordsPerVector); + vrf_raddr_o = vreg_addr_d; + end + else begin + + if (spatz_req_valid && vl_q == '0) begin + vreg_addr_d[0] = (spatz_req.vs2 + vstart) << $clog2(NrWordsPerVector); + vreg_addr_d[1] = (spatz_req.vs1 + vstart) << $clog2(NrWordsPerVector); + vreg_addr_d[2] = (spatz_req.vd + vstart) << $clog2(NrWordsPerVector); + + // Direct feedthrough + vrf_raddr_o = vreg_addr_d; + if (!spatz_req.op_arith.is_scalar) + input_tag.vd_addr = vreg_addr_d[2]; + + // Did we commit a word already? + if (word_issued) begin + vreg_addr_d[0] = vreg_addr_d[0] + (!spatz_req.op_arith.widen_vs2 || widening_upper_q); + vreg_addr_d[1] = vreg_addr_d[1] + (!spatz_req.op_arith.widen_vs1 || widening_upper_q); // if it is a widening operands, addr shouldn't add when reading the upper part. + vreg_addr_d[2] = vreg_addr_d[2] + (!spatz_req.op_arith.is_reduction && (!spatz_req.op_arith.is_narrowing || narrowing_upper_q)); + end + end else if (spatz_req_valid && vl_q < spatz_req.vl && word_issued) begin + vreg_addr_d[0] = vreg_addr_q[0] + (!spatz_req.op_arith.widen_vs2 || widening_upper_q); + vreg_addr_d[1] = vreg_addr_q[1] + (!spatz_req.op_arith.widen_vs1 || widening_upper_q); + vreg_addr_d[2] = vreg_addr_q[2] + (!spatz_req.op_arith.is_reduction && (!spatz_req.op_arith.is_narrowing || narrowing_upper_q)); + end + end + end + READ_V0_t: begin + vreg_addr_d[0] = ( 0 + vstart) << $clog2(NrWordsPerVector); + vreg_addr_d[1] = ( 1 + vstart) << $clog2(NrWordsPerVector); + vrf_raddr_o = vreg_addr_d; + end + default:; + endcase end: vreg_addr_proc always_comb begin : operand_req_proc vreg_r_req = '0; vreg_we = '0; - vreg_wbe = '0; - - if (spatz_req_valid && vl_q < spatz_req.vl) - // Request operands - vreg_r_req = {spatz_req.vd_is_src, spatz_req.use_vs1 && reduction_operand_request[1], spatz_req.use_vs2 && reduction_operand_request[0]}; + unique case(operand_state_q) // CMY: turn it into FSM logic + READ_V0_t: vreg_r_req = 3'b011; + READ_OPERANDS: begin + if(reduction_state_q == Reduction_Read_V0_t) vreg_r_req = 3'b011; + else + if (spatz_req_valid && vl_q < spatz_req.vl) + // Request operands + vreg_r_req = {spatz_req.vd_is_src, spatz_req.use_vs1 && reduction_operand_request[1], spatz_req.use_vs2 && reduction_operand_request[0]}; + end + default:; + endcase // Got a new result if (&(result_valid | ~pending_results) && !result_tag.reduction) begin vreg_we = !result_tag.wb; - vreg_wbe = '1; + end + + // Reduction finished execution + if (reduction_state_q == Reduction_WriteBack && result_valid[0]) begin + vreg_we = 1'b1; + end + end : operand_req_proc + + // CMY: vreg_wbe logic---------------------- + vlen_t 
vreg_wb_word_cnt_q, vreg_wb_word_cnt_d; + `FF(vreg_wb_word_cnt_q, vreg_wb_word_cnt_d, '0) + vew_e sew_wb; + logic widening_wb; + assign widening_wb = spatz_req.op_arith.widen_vs1 || spatz_req.op_arith.widen_vs2; + assign sew_wb = vew_e'(int'(spatz_req.vtype.vsew) + widening_wb); - if (result_tag.narrowing) begin - // Only write half of the elements - vreg_wbe = result_tag.narrowing_upper ? {{(N_FU*ELENB/2){1'b1}}, {(N_FU*ELENB/2){1'b0}}} : {{(N_FU*ELENB/2){1'b0}}, {(N_FU*ELENB/2){1'b1}}}; + vrf_be_t vreg_wbe_pre; + +always_comb begin : vreg_wbe_proc + vreg_wbe = '0; + vreg_wb_word_cnt_d = vreg_wb_word_cnt_q; + if ((result_tag.last && &(result_valid | ~pending_results) && reduction_state_q inside {Reduction_NormalExecution, Reduction_Wait}) || reduction_done) + vreg_wb_word_cnt_d = 0; + else if (&(result_valid | ~pending_results) /*&& !result_tag.reduction*/ && (!spatz_req.op_arith.is_narrowing || narrowing_upper_q)) vreg_wb_word_cnt_d = vreg_wb_word_cnt_q + 1; + // Got a new result + if (&(result_valid | ~pending_results) && !result_tag.reduction) begin + // vreg_we = !result_tag.wb; + //vreg_wbe = '1; + // vreg_wb_word_cnt_d = vreg_wb_word_cnt_q + 1; + if(!spatz_req.op_arith.vm && !spatz_req.op_arith.is_scalar && !result_tag.narrowing)// CMY: masking the wb results + // unique case (spatz_req.vtype.vsew) + unique case (sew_wb) // CMY: add widening support + EW_8:for(int i=0;i LSU -> VRF +// store: VRF -> LSU -> Memory + module spatz_vlsu import spatz_pkg::*; import rvv_pkg::*; @@ -67,7 +70,7 @@ module spatz_vlsu ////////////// typedef logic [IdWidth-1:0] id_t; - typedef logic [$clog2(NrWordsPerVector*8)-1:0] vreg_elem_t; + typedef logic [$clog2(NrWordsPerVector*8)-1:0] vreg_elem_t; // element index. a word is 256bit and an element is 64bit /////////////////////// // Operation queue // @@ -93,6 +96,8 @@ module spatz_vlsu ); // Convert the vl to number of bytes for all element widths + // CMY: spatz_req_i.vl: the number of elements for this instruction + // CMY: spatz_req_d.vl: the number of bytes for this instruction always_comb begin: proc_spatz_req spatz_req_d = spatz_req_i; @@ -118,7 +123,7 @@ module spatz_vlsu // Do we have a strided memory access logic mem_is_strided; - assign mem_is_strided = (mem_spatz_req.op == VLSE) || (mem_spatz_req.op == VSSE); + assign mem_is_strided = (mem_spatz_req.op == VLSE) || (mem_spatz_req.op == VSSE); // CMY: Vector Load/Store Strided Elements // Do we have an indexed memory access logic mem_is_indexed; @@ -128,15 +133,15 @@ module spatz_vlsu // State // ///////////// - typedef enum logic { - VLSU_RunningLoad, VLSU_RunningStore + typedef enum logic [1:0] { + VLSU_RunningLoad, VLSU_RunningStore, VLSU_ReadingV0_t } state_t; state_t state_d, state_q; `FF(state_q, state_d, VLSU_RunningLoad) - id_t [NrMemPorts-1:0] store_count_q; - id_t [NrMemPorts-1:0] store_count_d; + id_t [NrMemPorts-1:0] store_count_q; // id_t = 3: width of NrPendingLoads + id_t [NrMemPorts-1:0] store_count_d; // NrMemPorts = N_FU = 4 for (genvar port = 0; port < NrMemPorts; port++) begin: gen_store_count_q `FF(store_count_q[port], store_count_d[port], '0) @@ -149,7 +154,7 @@ module spatz_vlsu for (int port = 0; port < NrMemPorts; port++) begin if (spatz_mem_req_o[port].write && spatz_mem_req_valid_o[port] && spatz_mem_req_ready_i[port]) // Did we send a store? - store_count_d[port]++; + store_count_d[port]++; // number of outstanding store // Did we get the ack of a store? 
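// store_count_q[port] therefore acts as a per-port credit counter of in-flight
// stores: it is incremented when a store request is accepted, and decremented
// by the platform-specific acknowledgement path behind the `ifdef below.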
`ifdef MEMPOOL_SPATZ @@ -301,6 +306,7 @@ module spatz_vlsu vlen_t vstart; logic [2:0] rs1; + logic vm; // CMY: if it is a maskede memory instruction logic is_load; logic is_strided; logic is_indexed; @@ -327,7 +333,7 @@ module spatz_vlsu .full_o (/* Unused */ ), .data_o (commit_insn_q ), .empty_o (commit_insn_empty), - .pop_i (commit_insn_pop ), + .pop_i (commit_insn_pop ), // finish the execution .usage_o (/* Unused */ ) ); @@ -339,6 +345,7 @@ module spatz_vlsu vl : mem_spatz_req.vl, vstart : mem_spatz_req.vstart, rs1 : mem_spatz_req.rs1[2:0], + vm : mem_spatz_req.op_mem.vm, is_load : mem_spatz_req.op_mem.is_load, is_strided: mem_is_strided, is_indexed: mem_is_indexed @@ -385,7 +392,7 @@ module spatz_vlsu logic [NrMemPorts-1:0] commit_finished_q; logic [NrMemPorts-1:0] commit_finished_d; - for (genvar fu = 0; fu < N_FU; fu++) begin: gen_vreg_counters + for (genvar fu = 0; fu < N_FU; fu++) begin: gen_vreg_counters // N_FU: number of FPUs delta_counter #( .WIDTH($bits(vlen_t)) ) i_delta_counter_vreg ( @@ -407,15 +414,16 @@ module spatz_vlsu //////////////////////// // Address Generation // - //////////////////////// + //////////////////////// // CMY: VRF address generation - elen_t [NrMemPorts-1:0] mem_req_addr; + elen_t [NrMemPorts-1:0] mem_req_addr; // CMY: why elen_t? (64bits/hardware element) vrf_addr_t vd_vreg_addr; vrf_addr_t vs2_vreg_addr; + vrf_addr_t v0_t_vreg_addr_lo, v0_t_vreg_addr_hi; // Current element index and byte index that are being accessed at the register file - vreg_elem_t vd_elem_id; + vreg_elem_t vd_elem_id; // 256/64=4 [3:0] vreg_elem_t vs2_elem_id_d, vs2_elem_id_q; `FF(vs2_elem_id_q, vs2_elem_id_d, '0) @@ -435,25 +443,33 @@ module spatz_vlsu assign idx_offset = mem_idx_counter_q[port]; always_comb begin + addr = '0; + stride ='0; + offset ='0; stride = mem_is_strided ? mem_spatz_req.rs2 >> mem_spatz_req.vtype.vsew : 'd1; - + // stride here(HW) is in element, compared to Byte-based in SW if (mem_is_indexed) begin // What is the relationship between data and index width? automatic logic [1:0] data_index_width_diff = int'(mem_spatz_req.vtype.vsew) - int'(mem_spatz_req.op_mem.ew); - + // // op_mem.ew encodes the index element width (EW of VS2) // Pointer to index automatic logic [idx_width(N_FU*ELENB)-1:0] word_index = (port << (MAXEW - data_index_width_diff)) + (maxew_t'(idx_offset << data_index_width_diff) >> data_index_width_diff) + (maxew_t'(idx_offset >> (MAXEW - data_index_width_diff)) << (MAXEW - data_index_width_diff)) * NrMemPorts; - - // Index - unique case (mem_spatz_req.op_mem.ew) - EW_8 : offset = $signed(vrf_rdata_i[1][8 * word_index +: 8]); - EW_16: offset = $signed(vrf_rdata_i[1][8 * word_index +: 16]); - default: offset = $signed(vrf_rdata_i[1][8 * word_index +: 32]); - endcase - end else begin + // CMY: word_index: the index of byte in vs2 // starting point of a VRF port + lower bits of idx_offset, the index of bytes inside one port + higher bits of idx_offset, the starting point of a VRF word + // word_index = (start of this port’s slice) + (byte offset within the slice) + (how many whole slices we’ve advanced). 
+          // It computes the byte address inside the VS2 (index) vector
+          if (state_d == VLSU_RunningLoad || state_d == VLSU_RunningStore) begin
+            // Index
+            unique case (mem_spatz_req.op_mem.ew) // op_mem.ew encodes the index element width (EW of VS2)
+              EW_8 : offset = $signed(vrf_rdata_i[1][8 * word_index +: 8]);
+              EW_16: offset = $signed(vrf_rdata_i[1][8 * word_index +: 16]);
+              default: offset = $signed(vrf_rdata_i[1][8 * word_index +: 32]);
+            endcase
+          end
+        end else begin // strided or unit-stride (stride = 1)
           offset = ({mem_counter_q[port][$bits(vlen_t)-1:MAXEW] << $clog2(NrMemPorts), mem_counter_q[port][int'(MAXEW)-1:0]} + (port << MAXEW)) * stride;
         end
-
+        // CMY: start of the 32B block + in-port offset; the (port << MAXEW) term adds a port base offset so each port starts at a different initial byte position
+        // mem_counter_q: how many elements this port has issued/consumed
         addr = mem_spatz_req.rs1 + offset;
         mem_req_addr[port] = (addr >> MAXEW) << MAXEW;
         mem_req_addr_offset[port] = addr[int'(MAXEW)-1:0];
@@ -462,10 +478,20 @@ module spatz_vlsu
     end
   end: gen_mem_req_addr

+  logic v0_t_is_ready;
+  assign v0_t_is_ready = (state_q == VLSU_ReadingV0_t) && (&vrf_rvalid_i); // reuse vrf_read[1] for V0 reading
+  logic [VLEN-1:0] operand_v0_t, operand_v0_t_q; // CMY: v0 should be read from vrf
+  assign operand_v0_t = (state_q == VLSU_ReadingV0_t) ? {vrf_rdata_i[1], vrf_rdata_i[0]} : '0;
+
+  `FFL(operand_v0_t_q, operand_v0_t, v0_t_is_ready, '0) // CMY: backup v0.t
+
+
   // Calculate the register file address
   always_comb begin : gen_vreg_addr
     vd_vreg_addr  = (commit_insn_q.vd << $clog2(NrWordsPerVector)) + $unsigned(vd_elem_id);
     vs2_vreg_addr = (mem_spatz_req.vs2 << $clog2(NrWordsPerVector)) + $unsigned(vs2_elem_id_q);
+    v0_t_vreg_addr_lo = 0 << $clog2(NrWordsPerVector); // CMY: align prestart elements inside VLSU
+    v0_t_vreg_addr_hi = 1 << $clog2(NrWordsPerVector);
   end

   ///////////////
@@ -651,7 +677,7 @@ module spatz_vlsu
           commit_counter_d[fu] += ELENB;
         else if (commit_insn_q.vstart[idx_width(N_FU*ELENB)-1:$clog2(ELENB)] == fu)
           commit_counter_d[fu] += commit_insn_q.vstart[$clog2(ELENB)-1:0];
-      commit_operation_valid[fu] = commit_insn_valid && (commit_counter_q[fu] != max_elements) && (catchup[fu] || (!catchup[fu] && ~|catchup));
+      commit_operation_valid[fu] = (state_q == VLSU_RunningLoad || state_q == VLSU_RunningStore) && commit_insn_valid && (commit_counter_q[fu] != max_elements) && (catchup[fu] || (!catchup[fu] && ~|catchup)); // CMY: added a check of the current state
       commit_operation_last[fu] = commit_operation_valid[fu] && ((max_elements - commit_counter_q[fu]) <= (commit_is_single_element_operation ? commit_single_element_size : ELENB));
       commit_counter_delta[fu] = !commit_operation_valid[fu] ? vlen_t'('d0) : commit_is_single_element_operation ? vlen_t'(commit_single_element_size) : commit_operation_last[fu] ? 
(max_elements - commit_counter_q[fu]) : vlen_t'(ELENB); commit_counter_en[fu] = commit_operation_valid[fu] && (commit_insn_q.is_load && vrf_req_valid_d && vrf_req_ready_d) || (!commit_insn_q.is_load && vrf_rvalid_i[0] && vrf_re_o[0] && (!mem_is_indexed || vrf_rvalid_i[1])); @@ -702,18 +728,39 @@ module spatz_vlsu // State // /////////// + logic vlsu_rsp_valid_q; // register the instruction finish signal + logic v0_t_is_ready_q; + logic v0_t_read_done; + `FFLARNC(v0_t_read_done,1'b1,v0_t_is_ready,vlsu_rsp_valid_o,1'b0,clk_i,rst_ni); + `FF(v0_t_is_ready_q,v0_t_is_ready,'0); + always_comb begin: p_state // Maintain state state_d = state_q; unique case (state_q) VLSU_RunningLoad: begin + // if(mem_spatz_req_valid && !mem_spatz_req.op_mem.vm && !v0_t_read_done) + if(commit_insn_valid && !commit_insn_q.vm && !v0_t_read_done) + state_d = VLSU_ReadingV0_t; if (commit_insn_valid && !commit_insn_q.is_load) if (&rob_empty) state_d = VLSU_RunningStore; end + VLSU_ReadingV0_t: + if(/*v0_t_is_ready*/v0_t_is_ready & ~v0_t_is_ready_q) begin + state_d = VLSU_RunningLoad; + if (commit_insn_valid && !commit_insn_q.is_load) + // if (&rob_empty) // CMY: we don't need to wait rob_empty because read_v0_t doesn't go through rob. + state_d = VLSU_RunningStore; + // else state_d = VLSU_RunningLoad; + end + else state_d = state_q; + VLSU_RunningStore: begin + if(commit_insn_valid && !commit_insn_q.vm && !v0_t_read_done) + state_d = VLSU_ReadingV0_t; if (commit_insn_valid && commit_insn_q.is_load) if (&rob_empty) state_d = VLSU_RunningLoad; @@ -756,9 +803,64 @@ module spatz_vlsu end end + // CMY: generate masking based on V0.t----------------------------------- + logic [VLEN-1:0] vm_masking; + always_comb begin + vm_masking = '1; // to avoid latch + if(!commit_insn_q.vm) begin + case (commit_insn_q.vsew) + EW_8:for(int i=0;i> 5; + // Are we on the first/last VRF operation? logic vreg_operation_first; logic vreg_operation_last; - // FSM to decide whether we are on the first operation or not - typedef enum logic { + // FSM to decide whether we are on the first operation + /*typedef enum logic[1:0] { + VREG_READ_V0_t, // CMY: added a state to read v0.t VREG_IDLE, VREG_WAIT_FIRST_WRITE } vreg_operation_first_t; vreg_operation_first_t vreg_operation_first_q, vreg_operation_first_d; - `FF(vreg_operation_first_q, vreg_operation_first_d, VREG_IDLE) + `FF(vreg_operation_first_q, vreg_operation_first_d, VREG_IDLE)*/ + + logic v0_t_lo_is_ready,v0_t_hi_is_ready; + assign v0_t_lo_is_ready = (vreg_operation_first_q == VREG_READ_V0_t_lo) && vrf_rvalid_i; + assign v0_t_hi_is_ready = (vreg_operation_first_q == VREG_READ_V0_t_hi) && vrf_rvalid_i; + logic v0_t_lo_read_done,v0_t_hi_read_done; + `FFLARNC(v0_t_lo_read_done,1'b1,v0_t_lo_is_ready,vsldu_rsp_valid_o,1'b0,clk_i,rst_ni); + `FFLARNC(v0_t_hi_read_done,1'b1,v0_t_hi_is_ready,vsldu_rsp_valid_o,1'b0,clk_i,rst_ni); + + vrf_data_t operand_v0_t_lo,operand_v0_t_lo_q; // CMY: v0 should be read from vrf + vrf_data_t operand_v0_t_hi,operand_v0_t_hi_q; + assign operand_v0_t_lo = (vreg_operation_first_q == VREG_READ_V0_t_lo)? vrf_rdata_i:'0; + assign operand_v0_t_hi = (vreg_operation_first_q == VREG_READ_V0_t_hi)? 
vrf_rdata_i:'0; + `FFL(operand_v0_t_lo_q, operand_v0_t_lo, v0_t_lo_is_ready, '0) // CMY: backup v0.t + `FFL(operand_v0_t_hi_q, operand_v0_t_hi, v0_t_hi_is_ready, '0) + + logic [VLEN-1:0] operand_v0_t_q; + assign operand_v0_t_q = {operand_v0_t_hi_q,operand_v0_t_lo_q}; + + // CMY: generate masking based on V0.t----------------------------------- + logic [VLEN-1:0] vm_masking; + always_comb begin + vm_masking = '1; + if(!spatz_req.op_sld.vm) begin + case (spatz_req.vtype.vsew) + EW_8:for(int i=0;i vrf_req_valid_q -> vrf_we_o + vreg_operation_first_d = VREG_IDLE; // vrf_req_ready_q = vrf_wvalid_i end default:; endcase @@ -280,7 +362,7 @@ module spatz_vsldu end // Do we have to increment the counter? - vreg_counter_en = ((spatz_req.use_vs2 && vrf_re_o && vrf_rvalid_i) || !spatz_req.use_vs2) && ((spatz_req.use_vd && vrf_req_valid_d && vrf_req_ready_d) || !spatz_req.use_vd); + vreg_counter_en = (vreg_operation_first_q!=VREG_READ_V0_t_lo) && (vreg_operation_first_q!=VREG_READ_V0_t_hi) && ((spatz_req.use_vs2 && vrf_re_o && vrf_rvalid_i) || !spatz_req.use_vs2) && ((spatz_req.use_vd && vrf_req_valid_d && vrf_req_ready_d) || !spatz_req.use_vd); if (vreg_counter_en) begin if (vreg_operation_last) // Reset the counter @@ -351,6 +433,7 @@ module spatz_vsldu // Data signals for different stages of the shift vrf_data_t data_in, data_out, data_low, data_high; + vrf_be_t slide_wbe; // CMY: used for monitor wbe signals before vm_masking always_comb begin shift_overflow_d = shift_overflow_q; @@ -363,17 +446,20 @@ module spatz_vsldu vrf_req_d.wbe = '0; vrf_req_d.wdata = '0; + slide_wbe = '0; + // Is there a vector instruction executing now? if (!is_vl_zero) begin - if (is_slide_up && spatz_req.op_sld.insert && spatz_req.op_sld.vmv) begin - for (int b_src = 0; b_src < VRFWordBWidth; b_src++) - data_in[(VRFWordBWidth-b_src-1)*8 +: 8] = spatz_req.rs1[b_src*8%ELEN +: 8]; - end else if (is_slide_up) begin - // If we have a slide up operation, flip all bytes around (d[-i] = d[i]) - for (int b_src = 0; b_src < VRFWordBWidth; b_src++) - data_in[(VRFWordBWidth-b_src-1)*8 +: 8] = vrf_rdata_i[b_src*8 +: 8]; - end else begin - data_in = vrf_rdata_i; + if (is_slide_up && spatz_req.op_sld.insert && spatz_req.op_sld.vmv) begin + for (int b_src = 0; b_src < VRFWordBWidth; b_src++) + data_in[(VRFWordBWidth-b_src-1)*8 +: 8] = spatz_req.rs1[b_src*8%ELEN +: 8]; // CMY: rs1: value in the x[rs1] + end + else if (is_slide_up) begin + // If we have a slide up operation, flip all bytes around (d[-i] = d[i]) + for (int b_src = 0; b_src < VRFWordBWidth; b_src++) + data_in[(VRFWordBWidth-b_src-1)*8 +: 8] = (vreg_operation_first_q == VREG_READ_V0_t_lo || vreg_operation_first_q == VREG_READ_V0_t_hi )? data_in[(VRFWordBWidth-b_src-1)*8 +: 8] : vrf_rdata_i[b_src*8 +: 8]; + end else begin + data_in = (vreg_operation_first_q == VREG_READ_V0_t_lo || vreg_operation_first_q == VREG_READ_V0_t_hi)? 
data_in : vrf_rdata_i; // If we are already over the MAXVL, all continuing elements are zero if ((vreg_counter_q >= MAXVL - slide_amount_q) || (vreg_operation_last && spatz_req.op_sld.insert)) @@ -420,29 +506,33 @@ module spatz_vsldu // Insert rs1 element at the first position if (spatz_req.op_sld.insert && !spatz_req.op_sld.vmv && vreg_operation_first && spatz_req.vstart == 'd0) - vrf_req_d.wdata = vrf_req_d.wdata | vrf_data_t'(spatz_req.rs1); + vrf_req_d.wdata = vrf_req_d.wdata | vrf_data_t'(spatz_req.rs1); // CMY: fill the LSB with spatz_req.rs1 end else begin vrf_req_d.wdata = data_out; end // Create byte enable mask for (int i = 0; i < VRFWordBWidth; i++) - vrf_req_d.wbe[i] = i < vreg_counter_delta; + // vrf_req_d.wbe[i] = i < vreg_counter_delta; + slide_wbe[i] = i < vreg_counter_delta; // Special byte enable mask case when we are operating on the first register element. if (vreg_operation_first && is_slide_up) for (int i = 0; i < VRFWordBWidth; i++) - vrf_req_d.wbe[i] = (spatz_req.op_sld.insert || (i >= slide_amount_d[$clog2(VRFWordBWidth)-1:0])) & (i < (vreg_counter_q[$clog2(VRFWordBWidth)-1:0] + vreg_counter_delta)); + // vrf_req_d.wbe[i] = (spatz_req.op_sld.insert || (i >= slide_amount_d[$clog2(VRFWordBWidth)-1:0])) & (i < (vreg_counter_q[$clog2(VRFWordBWidth)-1:0] + vreg_counter_delta)); + slide_wbe[i] = (spatz_req.op_sld.insert || (i >= slide_amount_d[$clog2(VRFWordBWidth)-1:0])) & (i < (vreg_counter_q[$clog2(VRFWordBWidth)-1:0] + vreg_counter_delta)); end // Reset overflow register when finished if (vreg_operations_finished) shift_overflow_d = '0; + + vrf_req_d.wbe = slide_wbe & vm_masking[vreg_counter_mod32*32 +:32]; end // VRF signals - assign vrf_re_o = spatz_req.use_vs2 && (spatz_req_valid || prefetch_q) && running_q[spatz_req.id]; - assign vrf_req_valid_d = spatz_req_valid && spatz_req.use_vd && (vrf_re_o || !spatz_req.use_vs2) && (vrf_rvalid_i || !spatz_req.use_vs2) && !prefetch_q; + assign vrf_re_o = (vreg_operation_first_q == VREG_READ_V0_t_lo)||(vreg_operation_first_q == VREG_READ_V0_t_hi)||(spatz_req.use_vs2 && (spatz_req_valid || prefetch_q) && running_q[spatz_req.id]); + assign vrf_req_valid_d = (vreg_operation_first_q != VREG_READ_V0_t_lo)&&(vreg_operation_first_q != VREG_READ_V0_t_hi)&& spatz_req_valid && spatz_req.use_vd && (vrf_re_o || !spatz_req.use_vs2) && (vrf_rvalid_i || !spatz_req.use_vs2) && !prefetch_q; //////////////////////// // Address Generation // @@ -452,8 +542,14 @@ module spatz_vsldu always_comb begin sld_offset_rd = is_slide_up ? (prefetch_q ? -slide_amount_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] - 1 : -slide_amount_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)]) : prefetch_q ? slide_amount_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] : slide_amount_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] + 1; - vrf_raddr_o = {spatz_req.vs2, $clog2(NrWordsPerVector)'(1'b0)} + vreg_counter_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] + sld_offset_rd; + vrf_raddr_o = (vreg_operation_first_q == VREG_READ_V0_t_lo) ? + {0, $clog2(NrWordsPerVector)'(1'b0)} : + ((vreg_operation_first_q == VREG_READ_V0_t_hi) ? + {1, $clog2(NrWordsPerVector)'(1'b0)} : + ({spatz_req.vs2, $clog2(NrWordsPerVector)'(1'b0)} + vreg_counter_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] + sld_offset_rd)); + // vs2 base in VRF + the number of Word under operation + number of elements to slide vrf_req_d.waddr = {spatz_req.vd, $clog2(NrWordsPerVector)'(1'b0)} + vreg_counter_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)]; end - +// CMY: the number of elements to slide. 
slide_amount_q[$bits(vlen_t)-1:$clog2(VRFWordBWidth)] : word offset +// CMY: endmodule : spatz_vsldu diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 5c5a4234..6f906bec 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -26,9 +26,14 @@ SPATZ_CLUSTER_CFG_DEFINES += -DSNRT_NFPU_PER_CORE=$(shell python3 -c "import jst include $(ROOT)/util/Makefrag # QuestaSim -VSIM = questa-2021.3-kgf vsim -VLOG = questa-2021.3-kgf vlog -VSIM_HOME = /usr/pack/questa-2021.3-kgf/questasim +#VSIM = questa-2021.3-kgf vsim +#VLOG = questa-2021.3-kgf vlog +#VSIM_HOME = /usr/pack/questa-2021.3-kgf/questasim +VSIM = vsim +VLOG = vlog +VSIM_HOME = /sw/CAD/Siemens/questa/2024.3/questasim +#CMY modified + VSIM_FLAGS += -t 1ps VSIM_FLAGS += -do "log -r /*; source ${SPATZ_CLUSTER_DIR}/script/vsim/wave.tcl; run -a" @@ -164,11 +169,14 @@ clean.vcs: # SPYGLASS # ############ -SNPS_SG ?= spyglass-2022.06 +#SNPS_SG ?= spyglass-2022.06 +SNPS_SG ?= spyglass #Mamothones .PHONY: lint lint/tmp/files lint: generate lint/tmp/files lint/sdc/func.sdc lint/script/lint.tcl - cd lint && $(SNPS_SG) sg_shell -tcl script/lint.tcl +# cd lint && $(SNPS_SG) sg_shell -tcl script/lint.tcl + cd lint && sg_shell -tcl script/lint.tcl + lint/tmp/files: ${BENDER} mkdir -p lint/tmp diff --git a/hw/system/spatz_cluster/test/bootrom.elf b/hw/system/spatz_cluster/test/bootrom.elf index 22191858..179b2e60 100755 Binary files a/hw/system/spatz_cluster/test/bootrom.elf and b/hw/system/spatz_cluster/test/bootrom.elf differ diff --git a/sw/riscvTests/CMakeLists.txt b/sw/riscvTests/CMakeLists.txt index d5838211..7d17468f 100644 --- a/sw/riscvTests/CMakeLists.txt +++ b/sw/riscvTests/CMakeLists.txt @@ -85,6 +85,9 @@ add_snitch_test(vslide1up isa/rv64uv/vslide1up.c) add_snitch_test(vslideup isa/rv64uv/vslideup.c) add_snitch_test(vslide1down isa/rv64uv/vslide1down.c) add_snitch_test(vslidedown isa/rv64uv/vslidedown.c) +#CMY: float slide +#add_snitch_test(vfslide1down isa/rv64uv/vfslide1down.c) +#add_snitch_test(vfslide1up isa/rv64uv/vfslide1up.c) add_snitch_test(vdiv isa/rv64uv/vdiv.c) add_snitch_test(vdivu isa/rv64uv/vdivu.c) @@ -128,3 +131,26 @@ add_snitch_test(vfcvt isa/rv64uv/vfcvt.c) add_snitch_test(vfncvt isa/rv64uv/vfncvt.c) add_snitch_test(vfmv isa/rv64uv/vfmv.c) + +# CMY: masking logic instructions +add_snitch_test(vmand isa/rv64uv/vmand.c) +add_snitch_test(vmor isa/rv64uv/vmor.c) +add_snitch_test(vmandnot isa/rv64uv/vmandnot.c) +add_snitch_test(vmnand isa/rv64uv/vmnand.c) +add_snitch_test(vmnor isa/rv64uv/vmnor.c) +add_snitch_test(vmornot isa/rv64uv/vmornot.c) +add_snitch_test(vmxnor isa/rv64uv/vmxnor.c) +add_snitch_test(vmxor isa/rv64uv/vmxor.c) + +# CMY: Load/Store instructions +add_snitch_test(vle8 isa/rv64uv/vle8.c) +add_snitch_test(vle16 isa/rv64uv/vle16.c) +add_snitch_test(vle32 isa/rv64uv/vle32.c) +add_snitch_test(vle64 isa/rv64uv/vle64.c) +#add_snitch_test(vluxei isa/rv64uv/vluxei.c) # not supported by the original design. 
+add_snitch_test(vse8 isa/rv64uv/vse8.c) +add_snitch_test(vse16 isa/rv64uv/vse16.c) +add_snitch_test(vse32 isa/rv64uv/vse32.c) +add_snitch_test(vse64 isa/rv64uv/vse64.c) +add_snitch_test(vss isa/rv64uv/vss.c) # on fixing + diff --git a/sw/riscvTests/isa/rv64uv/vadd.c b/sw/riscvTests/isa/rv64uv/vadd.c index cfe9eac0..722fa5b7 100644 --- a/sw/riscvTests/isa/rv64uv/vadd.c +++ b/sw/riscvTests/isa/rv64uv/vadd.c @@ -36,7 +36,7 @@ void TEST_CASE1(void) { } void TEST_CASE2(void) { - VSET(16, e8, m8); + VSET(16, e8, m8); // #define VSET(VLEN, VTYPE, LMUL) VLOAD_8(v8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); VLOAD_8(v16, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); VLOAD_8(v0, 0xAA, 0xAA); @@ -187,16 +187,52 @@ void TEST_CASE6(void) { #endif } +void TEST_CASE7(void) { + const uint32_t scalar = 5; + + VSET(16, e8, m8); + VLOAD_8(v8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + VLOAD_8(v0, 0xAA, 0xAB); + VCLEAR(v24); + asm volatile("vadd.vx v24, v8, %[A], v0.t" ::[A] "r"(scalar)); + VCMP_U8(21, v24, 0, 7, 0, 9, 0, 11, 0, 13, 6, 7, 0, 9, 0, 11, 0, 13); + + VSET(16, e16, m8); + VLOAD_16(v8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + VLOAD_8(v0, 0xAA, 0xAB); + VCLEAR(v24); + asm volatile("vadd.vx v24, v8, %[A], v0.t" ::[A] "r"(scalar)); + VCMP_U16(22, v24, 0, 7, 0, 9, 0, 11, 0, 13, 6, 7, 0, 9, 0, 11, 0, 13); + + VSET(16, e32, m8); + VLOAD_32(v8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + VLOAD_8(v0, 0xAA, 0xAB); + VCLEAR(v24); + asm volatile("vadd.vx v24, v8, %[A], v0.t" ::[A] "r"(scalar)); + VCMP_U32(23, v24, 0, 7, 0, 9, 0, 11, 0, 13, 6, 7, 0, 9, 0, 11, 0, 13); + +#if ELEN == 64 + VSET(16, e64, m8); + VLOAD_64(v8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + VLOAD_8(v0, 0xAA, 0xAB); + VCLEAR(v24); + asm volatile("vadd.vx v24, v8, %[A], v0.t" ::[A] "r"(scalar)); + VCMP_U64(24, v24, 0, 7, 0, 9, 0, 11, 0, 13, 6, 7, 0, 9, 0, 11, 0, 13); +#endif +} + int main(void) { INIT_CHECK(); enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); TEST_CASE5(); - // TEST_CASE6(); + TEST_CASE6(); + TEST_CASE7(); - EXIT_CHECK(); + EXIT_CHECK(); } + \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vand.c b/sw/riscvTests/isa/rv64uv/vand.c index 99c76169..55c0c73e 100644 --- a/sw/riscvTests/isa/rv64uv/vand.c +++ b/sw/riscvTests/isa/rv64uv/vand.c @@ -161,7 +161,8 @@ void TEST_CASE3() { } void TEST_CASE4() { - const uint32_t scalar = 0x0ff00ff0; + const uint32_t scalar = 0x0ff00ff0; // snitch is 32-bit scalar core +// const uint64_t scalar = 0x0ff00ff00ff00ff0; VSET(12, e8, m8); VLOAD_8(v16, 0xff, 0x01, 0xf0, 0xff, 0x01, 0xf0, 0xff, 0x01, 0xf0, 0xff, 0x01, @@ -208,10 +209,14 @@ void TEST_CASE4() { 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef); asm volatile("vand.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar)); - VCMP_U64(16, v8, 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0, + /*VCMP_U64(16, v8, 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0, 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0, 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0, - 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0); + 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x00f000f000f000f0);*/ + VCMP_U64(16, v8, 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x0000000000f000f0, + 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x0000000000f000f0, + 0x000000000ff00ff0, 0xdeadbeefdeadbeef, 0x0000000000f000f0, + 0x000000000ff00ff0, 
0xdeadbeefdeadbeef, 0x0000000000f000f0); #endif } @@ -310,12 +315,12 @@ int main(void) { INIT_CHECK(); enable_vec(); - TEST_CASE1(); - // TEST_CASE2(); +// TEST_CASE1(); +// TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); - TEST_CASE5(); - // TEST_CASE6(); + TEST_CASE4(); +// TEST_CASE5(); +// TEST_CASE6(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vdiv.c b/sw/riscvTests/isa/rv64uv/vdiv.c index 03012cd9..eae8dea2 100644 --- a/sw/riscvTests/isa/rv64uv/vdiv.c +++ b/sw/riscvTests/isa/rv64uv/vdiv.c @@ -237,9 +237,9 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfadd.c b/sw/riscvTests/isa/rv64uv/vfadd.c index 0cfac68a..1ab37a31 100644 --- a/sw/riscvTests/isa/rv64uv/vfadd.c +++ b/sw/riscvTests/isa/rv64uv/vfadd.c @@ -445,14 +445,14 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); TEST_CASE5(); TEST_CASE6(); - // TEST_CASE7(); - // TEST_CASE8(); + TEST_CASE7(); + TEST_CASE8(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfmadd.c b/sw/riscvTests/isa/rv64uv/vfmadd.c index 3f83835e..0dc01387 100644 --- a/sw/riscvTests/isa/rv64uv/vfmadd.c +++ b/sw/riscvTests/isa/rv64uv/vfmadd.c @@ -434,9 +434,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfncvt.c b/sw/riscvTests/isa/rv64uv/vfncvt.c index 1e7f7874..80c5ab93 100644 --- a/sw/riscvTests/isa/rv64uv/vfncvt.c +++ b/sw/riscvTests/isa/rv64uv/vfncvt.c @@ -779,25 +779,25 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); TEST_CASE5(); - // TEST_CASE6(); + TEST_CASE6(); TEST_CASE7(); - // TEST_CASE8(); + TEST_CASE8(); TEST_CASE9(); - // TEST_CASE10(); + TEST_CASE10(); TEST_CASE11(); - // TEST_CASE12(); + TEST_CASE12(); TEST_CASE13(); - // TEST_CASE14(); + TEST_CASE14(); /* vfncvt.rod.f.f is not supported yet diff --git a/sw/riscvTests/isa/rv64uv/vfnmacc.c b/sw/riscvTests/isa/rv64uv/vfnmacc.c index 9d0e0b94..8ca73502 100644 --- a/sw/riscvTests/isa/rv64uv/vfnmacc.c +++ b/sw/riscvTests/isa/rv64uv/vfnmacc.c @@ -457,9 +457,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfnmadd.c b/sw/riscvTests/isa/rv64uv/vfnmadd.c index 9bc23a86..a40cb3e1 100644 --- a/sw/riscvTests/isa/rv64uv/vfnmadd.c +++ b/sw/riscvTests/isa/rv64uv/vfnmadd.c @@ -459,9 +459,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfredmax.c b/sw/riscvTests/isa/rv64uv/vfredmax.c index b1d26274..6d4e193e 100644 --- a/sw/riscvTests/isa/rv64uv/vfredmax.c +++ b/sw/riscvTests/isa/rv64uv/vfredmax.c @@ -345,10 +345,10 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); - // TEST_CASE5(); + TEST_CASE5(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfredmin.c b/sw/riscvTests/isa/rv64uv/vfredmin.c index e776b433..be21683a 100644 --- a/sw/riscvTests/isa/rv64uv/vfredmin.c +++ b/sw/riscvTests/isa/rv64uv/vfredmin.c @@ -345,10 +345,10 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); - // 
TEST_CASE5(); + TEST_CASE5(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfredosum.c b/sw/riscvTests/isa/rv64uv/vfredosum.c index a1976ed4..97a2e1af 100644 --- a/sw/riscvTests/isa/rv64uv/vfredosum.c +++ b/sw/riscvTests/isa/rv64uv/vfredosum.c @@ -345,10 +345,10 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); - // TEST_CASE5(); + TEST_CASE5(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfredusum.c b/sw/riscvTests/isa/rv64uv/vfredusum.c index 88e81a58..e6480169 100644 --- a/sw/riscvTests/isa/rv64uv/vfredusum.c +++ b/sw/riscvTests/isa/rv64uv/vfredusum.c @@ -345,10 +345,10 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); - // TEST_CASE5(); + TEST_CASE5(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfslide1down.c b/sw/riscvTests/isa/rv64uv/vfslide1down.c index 7c7d5342..a07ebc31 100644 --- a/sw/riscvTests/isa/rv64uv/vfslide1down.c +++ b/sw/riscvTests/isa/rv64uv/vfslide1down.c @@ -101,7 +101,7 @@ int main(void) { enable_fp(); TEST_CASE1(); - TEST_CASE2(); + // TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfslide1up.c b/sw/riscvTests/isa/rv64uv/vfslide1up.c index 4acd161b..e647380a 100644 --- a/sw/riscvTests/isa/rv64uv/vfslide1up.c +++ b/sw/riscvTests/isa/rv64uv/vfslide1up.c @@ -84,7 +84,7 @@ int main(void) { enable_fp(); TEST_CASE1(); - TEST_CASE2(); + // TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfwmacc.c b/sw/riscvTests/isa/rv64uv/vfwmacc.c index 3306c7da..35768871 100644 --- a/sw/riscvTests/isa/rv64uv/vfwmacc.c +++ b/sw/riscvTests/isa/rv64uv/vfwmacc.c @@ -352,9 +352,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfwmsac.c b/sw/riscvTests/isa/rv64uv/vfwmsac.c index 492273d0..75f56346 100644 --- a/sw/riscvTests/isa/rv64uv/vfwmsac.c +++ b/sw/riscvTests/isa/rv64uv/vfwmsac.c @@ -354,9 +354,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vfwmul.c b/sw/riscvTests/isa/rv64uv/vfwmul.c index 3ea2f5d9..c7651f61 100644 --- a/sw/riscvTests/isa/rv64uv/vfwmul.c +++ b/sw/riscvTests/isa/rv64uv/vfwmul.c @@ -255,9 +255,9 @@ int main(void) { enable_fp(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vle16.c b/sw/riscvTests/isa/rv64uv/vle16.c new file mode 100644 index 00000000..99b9a87d --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vle16.c @@ -0,0 +1,67 @@ +// Author: CMY + +#include "vector_macros.h" + +void TEST_CASE0(void) { // test vm signal + VSET(16, e16, m1); + VLOAD_16(v1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + VLOAD_8(v0, 0xAA, 0xAA); + volatile uint16_t INP1[] = {0xaabb,0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, + 0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123}; + asm volatile("vle16.v v1, (%0),v0.t" ::"r"(INP1)); + VCMP_U16(0, v1, 0x1,0x0123, 0x3, 0x89ab, 0x5, 0xcdef, 0x7, 0x4567, + 0x9, 0x4567, 11, 0xcdef, 13, 0x89ab, 15, 0x0123); +} + +void TEST_CASE1(void) { + VSET(16, e16, m1); + volatile uint16_t INP1[] = {0xaabb,0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, + 0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123}; + asm volatile("vle16.v v1, (%0)" 
::"r"(INP1)); + VCMP_U16(1, v1, 0xaabb,0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, + 0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123); +} + +// Positive-stride tests +void TEST_CASE2(void) { + VSET(4, e16, m1); + volatile uint16_t INP1[] = {0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123, + 0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123}; + uint64_t stride = 6; // stride unit is BYTE + asm volatile("vlse16.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U16(2, v1, 0x0123, 0xcdef, 0x4567, 0x4567); +} + +void TEST_CASE3(void) { + VSET(16, e16, m1); // SET the VLEN to 16 to use the 4 memory ports + volatile uint16_t INP1[] = {0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123, + 0x0123, 0x4567, 0x89ab, 0xcdef, 0xcdef, 0x89ab, 0x4567, 0x0123}; + uint64_t stride = 6; // stride unit is BYTE + asm volatile("vlse16.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U16(3, v1, 0x0123, 0xcdef, 0x4567, 0x4567,0xcdef,0x0123); +} + +void TEST_CASE4(void) { + VSET(4, e16, m1); + volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0, + 0xf9aa, 0x71f0, 0xc394, 0xbbd3}; + uint64_t stride = 4; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse16.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U16(4, v1, 0, 0x8f2e, 0, 0xc394); +} + + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE0(); + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vle32.c b/sw/riscvTests/isa/rv64uv/vle32.c new file mode 100644 index 00000000..6d5fa70c --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vle32.c @@ -0,0 +1,181 @@ +// Author: CMY + +#include "vector_macros.h" + + +void TEST_CASE1(void) { + VSET(16, e32, m1); + volatile uint32_t ALIGNED_I32[16] = { + 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, 0xab8b9148, + 0x90318509, 0x31897598, 0x83195999, 0x89139848}; + asm volatile("vle32.v v0, (%0)" ::"r"(ALIGNED_I32)); + VCMP_U32(1, v0, 0x9fe41920,0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, + 0xab8b9148, 0x90318509, 0x31897598, 0x83195999, 0x89139848); +} + +// Positive-stride tests +void TEST_CASE2(void) { + VSET(32, e32, m8); + volatile uint32_t INP1[] = {1, 2, 3, + 4, 5, 6, + 7, 8, 9, + 0xa, 0xb, 0xc, + 0xd, 0xe, 0xf, + 0x10, + 0x10,0xf,0xe,0xd,0xc,0xb,0xa,9,8,7,6,5,4,3,2,1}; + VCLEAR(v2); + asm volatile("vle32.v v2, (%0)" ::"r"(INP1)); + VCMP_U32(32, v2, 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + 0xa, 0xb, 0xc, + 0xd, 0xe, 0xf, + 0x10, + 0x10,0xf,0xe,0xd,0xc,0xb,0xa,9,8,7,6,5,4,3,2,1); +} + +void TEST_CASE3(void) { + VSET(4, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + uint64_t stride = 8; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse32.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U32(3, v1, 0, 0xf9aa71f0, 0, 0x99991348); +} + +void TEST_CASE10(void) { + VSET(8, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + VCLEAR(v1); + asm volatile("vle32.v v1, (%0)" ::"r"(INP1)); + VCMP_U32(10, v1, 0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1); +} + +void TEST_CASE11(void) { + VSET(16, e32, m2); + volatile uint32_t INP1[] = {0x9fe41920, 
0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, 0xab8b9148, + 0x90318509, 0x31897598, 0x83195999, 0x89139848}; + VCLEAR(v1); + asm volatile("vle32.v v1, (%0)" ::"r"(INP1)); + VCMP_U32(11, v1, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, 0xab8b9148, + 0x90318509, 0x31897598, 0x83195999, 0x89139848); +} + +void TEST_CASE12(void) { + VSET(16, e64, m8); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + // VCLEAR(v8); + VLOAD_8(v0, 0xFF, 0xFF); + VCLEAR(v8); + asm volatile("vle64.v v8, (%0), v0.t" ::"r"(INP1)); + VCMP_U64(12, v8, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); +} + +void TEST_CASE4(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAA); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(4, v8, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0x9fa831c7a11a9384, 0x1893179501093489, 0x1874754791888188, + 0x9013930148815808, 0x9031850931584902, 0x8319599991911111); +} + +void TEST_CASE5(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAA); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U64(5, v8, 0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0, + 0x9013930148815808, 0,0x8319599991911111); +} + +void TEST_CASE6(void) { + VSET(4, e64, m1); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1}; + uint64_t stride = 8; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(6, v1, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1); +} + +void TEST_CASE7(void) { + VSET(2, e64, m1); + volatile uint64_t INP1[] = {0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x01015ac1309bb678}; + uint64_t stride = 40; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(9, v1, 0x99991348a9f38cd1, 0x01015ac1309bb678); +} + +void TEST_CASE8(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 
0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAB); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U64(8, v8, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0, + 0x9013930148815808, 0,0x8319599991911111); +} + + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); +// TEST_CASE10(); +// TEST_CASE11(); +// TEST_CASE12(); +// // TEST_CASE4(); +// TEST_CASE5(); +// // TEST_CASE6(); +// // TEST_CASE7(); +// TEST_CASE8(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vle64.c b/sw/riscvTests/isa/rv64uv/vle64.c new file mode 100644 index 00000000..be5002cb --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vle64.c @@ -0,0 +1,220 @@ +// Author: CMY + +#include "vector_macros.h" + +// void TEST_CASE0(void) { // test vm signal +// VSET(16, e64, m8); +// volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, +// 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, +// 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, +// 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, +// 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, +// 0x8913984898951989}; +// asm volatile("vle64.v v8, (%0)" ::"r"(INP1)); +// VCMP_U64(0, v8, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, +// 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, +// 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, +// 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, +// 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, +// 0x8913984898951989); +// } + +void TEST_CASE0(void) { // test vm signal + VSET(16, e64, m8); + // VLOAD_64(v8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + volatile uint64_t INP1[] = {1, 2, 3, + 4, 5, 6, + 7, 8, 9, + 0xa, 0xb, 0xc, + 0xd, 0xe, 0xf, + 0x10}; + asm volatile("vle64.v v8, (%0)" ::"r"(INP1)); + VCMP_U64(0, v8, 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + 0xa, 0xb, 0xc, + 0xd, 0xe, 0xf, + 0x10); +} + +void TEST_CASE1(void) { + VSET(16, e64, m2); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + asm volatile("vle64.v v1, (%0)" ::"r"(INP1)); + VCMP_U64(1, v1, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); +} + +// Positive-stride tests +void TEST_CASE2(void) { + VSET(16, e64, m2); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 
0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + VCLEAR(v2); + VLOAD_8(v0, 0xFF, 0xFF); + asm volatile("vle64.v v2, (%0)" ::"r"(INP1)); + VCMP_U64(2, v2, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); +} + +void TEST_CASE3(void) { + VSET(4, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + uint64_t stride = 8; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse32.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U32(3, v1, 0, 0xf9aa71f0, 0, 0x99991348); +} + +void TEST_CASE10(void) { + VSET(8, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + VCLEAR(v1); + asm volatile("vle32.v v1, (%0)" ::"r"(INP1)); + VCMP_U32(10, v1, 0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1); +} + +void TEST_CASE11(void) { + VSET(16, e32, m2); + volatile uint32_t INP1[] = {0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, 0xab8b9148, + 0x90318509, 0x31897598, 0x83195999, 0x89139848}; + VCLEAR(v1); + asm volatile("vle32.v v1, (%0)" ::"r"(INP1)); + VCMP_U32(11, v1, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, 0x38197598, + 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, 0x90139301, 0xab8b9148, + 0x90318509, 0x31897598, 0x83195999, 0x89139848); +} + +void TEST_CASE12(void) { + VSET(16, e64, m8); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + // VCLEAR(v8); + VLOAD_8(v0, 0xFF, 0xFF); + VCLEAR(v8); + asm volatile("vle64.v v8, (%0), v0.t" ::"r"(INP1)); + VCMP_U64(12, v8, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); +} + +void TEST_CASE4(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAA); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(4, v8, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0x9fa831c7a11a9384, 0x1893179501093489, 0x1874754791888188, + 0x9013930148815808, 0x9031850931584902, 0x8319599991911111); +} + +void TEST_CASE5(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 
0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAA); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U64(5, v8, 0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0, + 0x9013930148815808, 0,0x8319599991911111); +} + +void TEST_CASE6(void) { + VSET(4, e64, m1); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1}; + uint64_t stride = 8; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(6, v1, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1); +} + +void TEST_CASE7(void) { + VSET(2, e64, m1); + volatile uint64_t INP1[] = {0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x01015ac1309bb678}; + uint64_t stride = 40; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(9, v1, 0x99991348a9f38cd1, 0x01015ac1309bb678); +} + +void TEST_CASE8(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAB); + VCLEAR(v8); + asm volatile("vlse64.v v8, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U64(8, v8, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0, + 0x9013930148815808, 0,0x8319599991911111); +} + + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE0(); + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE10(); + TEST_CASE11(); + TEST_CASE12(); + TEST_CASE4(); + TEST_CASE5(); + TEST_CASE6(); + TEST_CASE7(); + TEST_CASE8(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vle8.c b/sw/riscvTests/isa/rv64uv/vle8.c new file mode 100644 index 00000000..7f80ddb7 --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vle8.c @@ -0,0 +1,52 @@ +// Author: CMY + +#include "vector_macros.h" + +// Positive-stride tests +void TEST_CASE1(void) { + VSET(4, e8, m1); + volatile uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0, + 0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3}; + uint64_t stride = 3; + asm volatile("vlse8.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U8(1, v1, 0x9f, 0x20, 0x05, 0xaa); +} + +void TEST_CASE2(void) { + VSET(4, e8, m1); + VLOAD_8(v2, 0, 1, 2, 3); + volatile uint8_t INP[] = {0xff, 0x00, 0x0f, 0xf0}; + asm volatile("vluxei8.v v1, (%0), v2" ::"r"(INP)); + VCMP_U8(1, v1, 0xff, 0x00, 0x0f, 0xf0); +} + +void TEST_CASE3(void) { + VSET(4, e8, m1); + VLOAD_8(v2, 0, 1, 2, 3); + volatile uint8_t INP[] = {0xff, 0x00, 0x0f, 0xf0}; + asm volatile("vloxei8.v v1, (%0), v2" ::"r"(INP)); + VCMP_U8(1, v1, 0xff, 0x00, 0x0f, 0xf0); +} + +void TEST_CASE4(void) { + VSET(4, e8, m1); + volatile uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0, + 0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3}; + uint64_t stride = 3; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse8.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U8(4, v1, 0x00, 0x20,0x00, 0xaa); +} + +int 
main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + // TEST_CASE2(); + // TEST_CASE3(); + TEST_CASE4(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vmacc.c b/sw/riscvTests/isa/rv64uv/vmacc.c index e24d7323..8688f62b 100644 --- a/sw/riscvTests/isa/rv64uv/vmacc.c +++ b/sw/riscvTests/isa/rv64uv/vmacc.c @@ -82,6 +82,7 @@ void TEST_CASE1() { void TEST_CASE2() { VSET(16, e8, m8); +// VLOAD_8(v0, 0xAA, 0xAA); // just for test VLOAD_8(v24, 0x21, 0x75, 0x7f, 0x3a, 0x50, 0x6d, 0x3f, 0x3e, 0x74, 0x11, 0x29, 0xea, 0x14, 0xce, 0xb0, 0x37); VLOAD_8(v16, 0xfe, 0xa7, 0x06, 0xaa, 0x35, 0x3c, 0x2c, 0x58, 0xa1, 0xc4, 0x40, @@ -226,6 +227,7 @@ void TEST_CASE3() { void TEST_CASE4() { VSET(16, e8, m8); int64_t scalar = 5; +// VLOAD_8(v0, 0xAA, 0xAA); VLOAD_8(v16, 0x60, 0xe3, 0xa0, 0xb7, 0x35, 0x23, 0xa3, 0xf4, 0x5f, 0x6e, 0x07, 0x01, 0xe7, 0x51, 0x53, 0x29); VLOAD_8(v8, 0xfb, 0x1b, 0xc0, 0x36, 0xa7, 0xe0, 0xc8, 0x47, 0x57, 0xe0, 0x51, @@ -298,9 +300,9 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vmand.c b/sw/riscvTests/isa/rv64uv/vmand.c new file mode 100644 index 00000000..82332f4b --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmand.c @@ -0,0 +1,79 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0x84, 0x21); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0xCD, 0xEF); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0x00, 0x00); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0x0D, 0xE0); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0x84, 0x21); +} + +void TEST_CASE6() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF, 0xCD, 0xEF, 0xCD, 0xEF, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21, 0x84, 0x21, 0x84, 0x21, 0x84, 0x21); + asm volatile("vmand.mm v1, v2, v3"); + VSET(13, e8, m1); + VCLEAR(v2); + VCMP_U8(6, v2, 0, 0, 0, 0, 0, 0, 0, 0); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + TEST_CASE6(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmandnot.c b/sw/riscvTests/isa/rv64uv/vmandnot.c new file mode 100644 index 00000000..5902eedd --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmandnot.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmandnot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0x49, 0xCE); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmandnot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0x00, 0x00); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmandnot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0xCD, 0xEF); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmandnot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0xC0, 0x0F); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmandnot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0x49, 0xCE); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmnand.c b/sw/riscvTests/isa/rv64uv/vmnand.c new file mode 100644 index 00000000..543fb28b --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmnand.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmnand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0x7B, 0xDE); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmnand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0x32, 0x10); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmnand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0xFF, 0xFF); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmnand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0xF2, 0x1F); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmnand.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0x7B, 0xDE); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmnor.c b/sw/riscvTests/isa/rv64uv/vmnor.c new file mode 100644 index 00000000..61a2f81f --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmnor.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0x32, 0x10); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0x00, 0x00); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0x32, 0x10); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0x30, 0x00); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0x32, 0x10); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmor.c b/sw/riscvTests/isa/rv64uv/vmor.c new file mode 100644 index 00000000..2ba46e40 --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmor.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0xCD, 0xEF); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0xFF, 0xFF); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0xCD, 0xEF); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0xCF, 0xFF); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0xCD, 0xEF); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmornot.c b/sw/riscvTests/isa/rv64uv/vmornot.c new file mode 100644 index 00000000..51ca1430 --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmornot.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmornot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0xFF, 0xFF); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmornot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0xCD, 0xEF); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmornot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0xFF, 0xFF); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmornot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0xFD, 0xEF); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmornot.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0xFF, 0xFF); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmul.c b/sw/riscvTests/isa/rv64uv/vmul.c index 32c1cc06..733f43c9 100644 --- a/sw/riscvTests/isa/rv64uv/vmul.c +++ b/sw/riscvTests/isa/rv64uv/vmul.c @@ -237,9 +237,9 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vmxnor.c b/sw/riscvTests/isa/rv64uv/vmxnor.c new file mode 100644 index 00000000..74820c16 --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmxnor.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmxnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0xB6, 0x31); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmxnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0xCD, 0xEF); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmxnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0x32, 0x10); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmxnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0x3D, 0xE0); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmxnor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0xB6, 0x31); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vmxor.c b/sw/riscvTests/isa/rv64uv/vmxor.c new file mode 100644 index 00000000..444814ae --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vmxor.c @@ -0,0 +1,68 @@ +// Copyright 2021 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matheus Cavalcante +// Basile Bougenot + +#include "vector_macros.h" + +void TEST_CASE1() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + asm volatile("vmxor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(1, v1, 0x49, 0xCE); +} + +void TEST_CASE2() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0xFF, 0xFF); + asm volatile("vmxor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(2, v1, 0x32, 0x10); +} + +void TEST_CASE3() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x00, 0x00); + asm volatile("vmxor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(3, v1, 0xCD, 0xEF); +} + +void TEST_CASE4() { + VSET(16, e8, m1); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x0F, 0xF0); + asm volatile("vmxor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(4, v1, 0xC2, 0x1F); +} + +void TEST_CASE5() { + VSET(16, e8, m1); + VLOAD_8(v1, 0xFF, 0xFF); + VLOAD_8(v2, 0xCD, 0xEF); + VLOAD_8(v3, 0x84, 0x21); + VSET(16, e8, m1); + asm volatile("vmxor.mm v1, v2, v3"); + VSET(16, e8, m1); + VCMP_U8(5, v1, 0x49, 0xCE); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} diff --git a/sw/riscvTests/isa/rv64uv/vnmsac.c b/sw/riscvTests/isa/rv64uv/vnmsac.c index be9a4c41..4c6c225e 100644 --- a/sw/riscvTests/isa/rv64uv/vnmsac.c +++ b/sw/riscvTests/isa/rv64uv/vnmsac.c @@ -82,13 +82,14 @@ void TEST_CASE1() { void TEST_CASE2() { VSET(16, e8, m8); + VLOAD_8(v0, 0xAA, 0xAA); VLOAD_8(v24, 0x41, 0x5b, 0xd0, 0x04, 0xc4, 0x7a, 0x91, 0xd1, 0x7b, 0x09, 0x85, 0x59, 0x2b, 0xe3, 0x33, 0xb9); VLOAD_8(v16, 0xc5, 0x4d, 0xad, 0x35, 0x81, 0x18, 0x48, 0x50, 0xe7, 0x95, 0x7b, 0x18, 0xe6, 0x44, 0x57, 0xaf); VLOAD_8(v8, 0x53, 0x13, 0x2c, 0xd8, 0x4a, 0xc3, 0xa3, 0xd7, 0x7e, 0x1f, 0x4c, 0x4e, 0x2e, 0x7d, 0x13, 0x5a); - VLOAD_8(v0, 0xAA, 0xAA); +// VLOAD_8(v0, 0xAA, 0xAA); asm volatile("vnmsac.vv v8, v16, v24, v0.t"); VCMP_U8(5, v8, 0x53, 0xb4, 0x2c, 0x04, 0x4a, 0x53, 0xa3, 0x87, 0x7e, 0xe2, 0x4c, 0xf6, 0x2e, 0x31, 0x13, 0xe3); @@ -226,11 +227,12 @@ void TEST_CASE3() { void TEST_CASE4() { VSET(16, e8, m8); int64_t scalar = 5; + VLOAD_8(v0, 0xAA, 0xAA); VLOAD_8(v24, 0x5e, 0xf5, 0xa9, 0x0b, 0x14, 0x3c, 0x84, 0x22, 0xd7, 0xb6, 0x5c, 0x90, 0xa2, 0x67, 0x3d, 0xf5); VLOAD_8(v8, 0xfa, 0xd9, 0x2a, 0xe2, 0xe7, 0x1f, 0x8c, 0xbd, 0x40, 0x5d, 0x50, 0x1f, 0xe0, 0xdd, 0x1f, 0xd7); - VLOAD_8(v0, 0xAA, 0xAA); +// VLOAD_8(v0, 0xAA, 0xAA); asm volatile("vnmsac.vx v8, %[A], v24, v0.t" ::[A] "r"(scalar)); VCMP_U8(13, v8, 0xfa, 0x10, 0x2a, 0xab, 0xe7, 0xf3, 0x8c, 0x13, 0x40, 0xcf, 0x50, 0x4f, 0xe0, 0xda, 0x1f, 0x0e); @@ -298,9 +300,9 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vnmsub.c b/sw/riscvTests/isa/rv64uv/vnmsub.c index cd1cbc72..b4ee0c75 100644 --- a/sw/riscvTests/isa/rv64uv/vnmsub.c +++ b/sw/riscvTests/isa/rv64uv/vnmsub.c @@ -82,13 +82,14 @@ void TEST_CASE1() { void TEST_CASE2() { VSET(16, e8, m8); + VLOAD_8(v0, 0xAA, 0xAA); VLOAD_8(v8, 0x41, 0x5b, 0xd0, 0x04, 0xc4, 0x7a, 0x91, 0xd1, 0x7b, 0x09, 0x85, 0x59, 0x2b, 0xe3, 0x33, 0xb9); VLOAD_8(v16, 0xc5, 0x4d, 0xad, 0x35, 0x81, 0x18, 0x48, 0x50, 0xe7, 0x95, 0x7b, 0x18, 0xe6, 0x44, 0x57, 0xaf); VLOAD_8(v24, 0x53, 0x13, 0x2c, 0xd8, 0x4a, 0xc3, 0xa3, 0xd7, 0x7e, 0x1f, 0x4c, 0x4e, 0x2e, 0x7d, 0x13, 
0x5a); - VLOAD_8(v0, 0xAA, 0xAA); +// VLOAD_8(v0, 0xAA, 0xAA); asm volatile("vnmsub.vv v8, v16, v24, v0.t"); VCMP_U8(5, v8, 0x41, 0xb4, 0xd0, 0x04, 0xc4, 0x53, 0x91, 0x87, 0x7b, 0xe2, 0x85, 0xf6, 0x2b, 0x31, 0x33, 0xe3); @@ -226,11 +227,12 @@ void TEST_CASE3() { void TEST_CASE4() { VSET(16, e8, m8); int64_t scalar = 5; + VLOAD_8(v0, 0xAA, 0xAA); VLOAD_8(v8, 0x5e, 0xf5, 0xa9, 0x0b, 0x14, 0x3c, 0x84, 0x22, 0xd7, 0xb6, 0x5c, 0x90, 0xa2, 0x67, 0x3d, 0xf5); VLOAD_8(v24, 0xfa, 0xd9, 0x2a, 0xe2, 0xe7, 0x1f, 0x8c, 0xbd, 0x40, 0x5d, 0x50, 0x1f, 0xe0, 0xdd, 0x1f, 0xd7); - VLOAD_8(v0, 0xAA, 0xAA); +// VLOAD_8(v0, 0xAA, 0xAA); asm volatile("vnmsub.vx v8, %[A], v24, v0.t" ::[A] "r"(scalar)); VCMP_U8(13, v8, 0x5e, 0x10, 0xa9, 0xab, 0x14, 0xf3, 0x84, 0x13, 0xd7, 0xcf, 0x5c, 0x4f, 0xa2, 0xda, 0x3d, 0x0e); @@ -298,9 +300,9 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vnsra.c b/sw/riscvTests/isa/rv64uv/vnsra.c index 5c4e9b56..5a339de4 100644 --- a/sw/riscvTests/isa/rv64uv/vnsra.c +++ b/sw/riscvTests/isa/rv64uv/vnsra.c @@ -232,11 +232,11 @@ int main(void) { enable_vec(); TEST_CASE1(); - TEST_CASE2(); +// TEST_CASE2(); TEST_CASE3(); - TEST_CASE4(); +// TEST_CASE4(); TEST_CASE5(); - TEST_CASE6(); +// TEST_CASE6(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vor.c b/sw/riscvTests/isa/rv64uv/vor.c index be95164e..e943c0e1 100644 --- a/sw/riscvTests/isa/rv64uv/vor.c +++ b/sw/riscvTests/isa/rv64uv/vor.c @@ -311,11 +311,11 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); - // TEST_CASE4(); + TEST_CASE4(); TEST_CASE5(); - // TEST_CASE6(); + TEST_CASE6(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredand.c b/sw/riscvTests/isa/rv64uv/vredand.c index a354b18d..362892bb 100644 --- a/sw/riscvTests/isa/rv64uv/vredand.c +++ b/sw/riscvTests/isa/rv64uv/vredand.c @@ -91,7 +91,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredmax.c b/sw/riscvTests/isa/rv64uv/vredmax.c index 0b6953d6..a4f1d9fd 100644 --- a/sw/riscvTests/isa/rv64uv/vredmax.c +++ b/sw/riscvTests/isa/rv64uv/vredmax.c @@ -77,7 +77,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredmaxu.c b/sw/riscvTests/isa/rv64uv/vredmaxu.c index 1e38a098..3852b890 100644 --- a/sw/riscvTests/isa/rv64uv/vredmaxu.c +++ b/sw/riscvTests/isa/rv64uv/vredmaxu.c @@ -106,7 +106,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); EXIT_CHECK(); diff --git a/sw/riscvTests/isa/rv64uv/vredmin.c b/sw/riscvTests/isa/rv64uv/vredmin.c index 758cf2b0..cc1a3f6c 100644 --- a/sw/riscvTests/isa/rv64uv/vredmin.c +++ b/sw/riscvTests/isa/rv64uv/vredmin.c @@ -77,7 +77,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredminu.c b/sw/riscvTests/isa/rv64uv/vredminu.c index f33fb52e..0d1583f7 100644 --- a/sw/riscvTests/isa/rv64uv/vredminu.c +++ b/sw/riscvTests/isa/rv64uv/vredminu.c @@ -77,7 +77,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredor.c b/sw/riscvTests/isa/rv64uv/vredor.c index decc6719..ba4047e2 100644 --- a/sw/riscvTests/isa/rv64uv/vredor.c +++ b/sw/riscvTests/isa/rv64uv/vredor.c @@ 
-91,7 +91,7 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vredsum.c b/sw/riscvTests/isa/rv64uv/vredsum.c index 4d515878..2b4016d3 100644 --- a/sw/riscvTests/isa/rv64uv/vredsum.c +++ b/sw/riscvTests/isa/rv64uv/vredsum.c @@ -179,10 +179,10 @@ int main(void) { enable_vec(); TEST_CASE1(); - // TEST_CASE2(); + TEST_CASE2(); TEST_CASE3(); TEST_CASE4(); - // TEST_CASE5(); + TEST_CASE5(); EXIT_CHECK(); } diff --git a/sw/riscvTests/isa/rv64uv/vse16.c b/sw/riscvTests/isa/rv64uv/vse16.c new file mode 100644 index 00000000..d77d182c --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vse16.c @@ -0,0 +1,97 @@ +#include "vector_macros.h" + + +void TEST_CASE1(void) { + VSET(16, e8, m1); + volatile uint16_t ALIGNED_I16[1024]; + VLOAD_16(v0, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + asm volatile("vse16.v v0, (%0)" ::"r"(ALIGNED_I16)); + VVCMP_U16(1, ALIGNED_I16, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, + 0x3489, 0x9388, 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, + 0x1111, 0x1989); +} + +void TEST_CASE2(void) { + volatile uint16_t ALIGNED_I16[16]={0}; + VSET(16, e16, m1); + VLOAD_16(v3, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + VLOAD_8(v0, 0xFF, 0xFF); + asm volatile("vse16.v v3, (%0), v0.t" ::"r"(ALIGNED_I16)); + VCLEAR(v3); + VVCMP_U16(2, ALIGNED_I16, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, + 0x3489, 0x9388, 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, + 0x1111, 0x1989); +} + +//*******Checking functionality of vse16 with different values of masking +// register******// +void TEST_CASE3(void) { + volatile uint16_t ALIGNED_I16[16]; + VSET(16, e16, m1); + VLOAD_16(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse16.v v3, (%0)" ::"r"(ALIGNED_I16)); + VCLEAR(v3); + VLOAD_16(v3, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + VLOAD_8(v0, 0x00, 0x00); + asm volatile("vse16.v v3, (%0), v0.t" ::"r"(ALIGNED_I16)); + VCLEAR(v3); + VVCMP_U16(3, ALIGNED_I16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16); +} + +void TEST_CASE4(void) { + volatile uint16_t ALIGNED_I16[16]; + VSET(16, e16, m1); + VLOAD_16(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse16.v v3, (%0)" ::"r"(ALIGNED_I16)); + VCLEAR(v3); + VLOAD_16(v3, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + VLOAD_8(v0, 0xAA, 0xAA); + asm volatile("vse16.v v3, (%0), v0.t" ::"r"(ALIGNED_I16)); + VCLEAR(v3); + VVCMP_U16(4, ALIGNED_I16, 1, 0xbbd3, 3, 0x8cd1, 5, 0x7548, 7, 0x9388, 9, + 0x11ae, 11, 0x4891, 13, 0x8759, 15, 0x1989); +} + +void TEST_CASE5(void) { + volatile uint16_t ALIGNED_I16[16] = {0}; + VSET(16, e16, m1); + VLOAD_16(v8, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + VSET(16, e8, m4); + asm volatile("vse16.v v8, (%0)" ::"r"(ALIGNED_I16)); + VCLEAR(v8); + VVCMP_U16(5, ALIGNED_I16, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, + 0x3489, 0x9388, 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, + 0x1111, 0x1989); +} + +void TEST_CASE6(void) { + volatile uint16_t ALIGNED_I16[16] = {0}; + VSET(16, e16, m1); + VLOAD_16(v6, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 
0x9384, 0x7548, 0x3489, 0x9388, + 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, 0x1111, 0x1989); + asm volatile("vse16.v v6, (%0)" ::"r"(ALIGNED_I16)); + VCLEAR(v6); + VVCMP_U16(6, ALIGNED_I16, 0x05e0, 0xbbd3, 0x3840, 0x8cd1, 0x9384, 0x7548, + 0x3489, 0x9388, 0x8188, 0x11ae, 0x5808, 0x4891, 0x4902, 0x8759, + 0x1111, 0x1989); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + TEST_CASE6(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vse32.c b/sw/riscvTests/isa/rv64uv/vse32.c new file mode 100644 index 00000000..e65e391a --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vse32.c @@ -0,0 +1,101 @@ +#include "vector_macros.h" + + +//**********Checking functionality of vse32********// +void TEST_CASE1(void) { + volatile uint32_t ALIGNED_I32[1024]; + VSET(16, e32, m1); + VLOAD_32(v0, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, + 0x38197598, 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, + 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, 0x83195999, + 0x89139848); + asm volatile("vse32.v v0, (%0)" ::"r"(ALIGNED_I32)); + VVCMP_U32(1, ALIGNED_I32, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, + 0x9fa831c7, 0x38197598, 0x18931795, 0x81937598, 0x18747547, + 0x3eeeeeee, 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, + 0x83195999, 0x89139848); +} + +//*******Checking functionality of vse32 with different values of masking +// register******// +void TEST_CASE2(void) { + volatile uint32_t ALIGNED_I32[1024]={0}; + VSET(16, e32, m1); + VLOAD_32(v3, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, + 0x38197598, 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, + 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, 0x83195999, + 0x89139848); + VLOAD_8(v0, 0xFF, 0xFF); + asm volatile("vse32.v v3, (%0), v0.t" ::"r"(ALIGNED_I32)); + VCLEAR(v3); + VVCMP_U32(2, ALIGNED_I32, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, + 0x9fa831c7, 0x38197598, 0x18931795, 0x81937598, 0x18747547, + 0x3eeeeeee, 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, + 0x83195999, 0x89139848); +} + +void TEST_CASE3(void) { + volatile uint32_t ALIGNED_I32[1024]; + VSET(16, e32, m1); + VLOAD_32(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse32.v v3, (%0)" ::"r"(ALIGNED_I32)); + VCLEAR(v3); + VLOAD_32(v3, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, + 0x38197598, 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, + 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, 0x83195999, + 0x89139848); + VLOAD_8(v0, 0x00, 0x00); + asm volatile("vse32.v v3, (%0), v0.t" ::"r"(ALIGNED_I32)); + VCLEAR(v3); + VVCMP_U32(3, ALIGNED_I32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16); +} + +void TEST_CASE4(void) { + volatile uint32_t ALIGNED_I32[1024]; + VSET(16, e32, m1); + VLOAD_32(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse32.v v3, (%0)" ::"r"(ALIGNED_I32)); + VCLEAR(v3); + VLOAD_32(v3, 0x11111111, 0x22222222, 0x33333333, 0x44444444, + 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, + 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, + 0xffffffff,0x00000000); + VLOAD_8(v0, 0xAA, 0xAA); + asm volatile("vse32.v v3, (%0), v0.t" ::"r"(ALIGNED_I32)); + VCLEAR(v3); + VVCMP_U32(4, ALIGNED_I32, 1, 0x22222222, 3, 0x44444444, 5, 0x66666666, 7, + 0x88888888, 9, 0xaaaaaaaa, 11, 0xcccccccc, 13, 0xeeeeeeee, 15, + 0x00000000); +} + +// change LMUL and EW +void TEST_CASE5(void) { + volatile uint32_t ALIGNED_I32[1024] 
= {0}; + VSET(16, e32, m1); + VLOAD_32(v8, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, 0x9fa831c7, + 0x38197598, 0x18931795, 0x81937598, 0x18747547, 0x3eeeeeee, + 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, 0x83195999, + 0x89139848); + VSET(16, e8, m2); // ? uncertain + asm volatile("vse32.v v8, (%0)" ::"r"(ALIGNED_I32)); + VCLEAR(v8); + VVCMP_U32(5, ALIGNED_I32, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348, + 0x9fa831c7, 0x38197598, 0x18931795, 0x81937598, 0x18747547, + 0x3eeeeeee, 0x90139301, 0xab8b9148, 0x90318509, 0x31897598, + 0x83195999, 0x89139848); +} + + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vse64.c b/sw/riscvTests/isa/rv64uv/vse64.c new file mode 100644 index 00000000..94b50dfa --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vse64.c @@ -0,0 +1,118 @@ +#include "vector_macros.h" + + +//**********Checking functionality of vse64********// +void TEST_CASE1(void) { + volatile uint64_t ALIGNED_I64[1024]; + VSET(16, e64, m2); + VLOAD_64(v0, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); + asm volatile("vse64.v v0, (%0)" ::"r"(ALIGNED_I64)); + VVCMP_U64(1, ALIGNED_I64, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x3819759853987548, 0x1893179501093489, 0x81937598aa819388, + 0x1874754791888188, 0x3eeeeeeee33111ae, 0x9013930148815808, + 0xab8b914891484891, 0x9031850931584902, 0x3189759837598759, + 0x8319599991911111, 0x8913984898951989); +} + +//*******Checking functionality of vse64 with different values of masking +// register******// +void TEST_CASE2(void) { + volatile uint64_t ALIGNED_I64[1024] = {0}; + VSET(16, e64, m2); + VLOAD_64(v3, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); + VLOAD_8(v0, 0xFF, 0xFF); + asm volatile("vse64.v v3, (%0), v0.t" ::"r"(ALIGNED_I64)); + VCLEAR(v3); + VVCMP_U64(2, ALIGNED_I64, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x3819759853987548, 0x1893179501093489, 0x81937598aa819388, + 0x1874754791888188, 0x3eeeeeeee33111ae, 0x9013930148815808, + 0xab8b914891484891, 0x9031850931584902, 0x3189759837598759, + 0x8319599991911111, 0x8913984898951989); +} + +void TEST_CASE3(void) { + volatile uint64_t ALIGNED_I64[1024] = {0}; + VSET(16, e64, m2); + VLOAD_64(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse64.v v3, (%0)" ::"r"(ALIGNED_I64)); + VCLEAR(v3); + VLOAD_64(v3, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); + VLOAD_8(v0, 0x00, 0x00); + asm volatile("vse64.v v3, (%0), v0.t" 
::"r"(ALIGNED_I64)); + VCLEAR(v3); + VVCMP_U64(3, ALIGNED_I64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16); +} + +void TEST_CASE4(void) { + volatile uint64_t ALIGNED_I64[1024] = {0}; + VSET(16, e64, m2); + VLOAD_64(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + asm volatile("vse64.v v3, (%0)" ::"r"(ALIGNED_I64)); + VCLEAR(v3); + VLOAD_64(v3, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); + VLOAD_8(v0, 0xAA, 0xAA); + asm volatile("vse64.v v3, (%0), v0.t" ::"r"(ALIGNED_I64)); + VCLEAR(v3); + VVCMP_U64(4, ALIGNED_I64, 1, 0xf9aa71f0c394bbd3, 3, 0x99991348a9f38cd1, 5, + 0x3819759853987548, 7, 0x81937598aa819388, 9, 0x3eeeeeeee33111ae, + 11, 0xab8b914891484891, 13, 0x3189759837598759, 15, + 0x8913984898951989); +} + +// change LMUL and EW +void TEST_CASE5(void) { + volatile uint64_t ALIGNED_I64[1024] = {0}; + VSET(16, e64, m2); + VLOAD_64(v8, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989); + VSET(16, e8, m1); + asm volatile("vse64.v v8, (%0)" ::"r"(ALIGNED_I64)); + VCLEAR(v8); + VVCMP_U64(10, ALIGNED_I64, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x3819759853987548, 0x1893179501093489, 0x81937598aa819388, + 0x1874754791888188, 0x3eeeeeeee33111ae, 0x9013930148815808, + 0xab8b914891484891, 0x9031850931584902, 0x3189759837598759, + 0x8319599991911111, 0x8913984898951989); +} + + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + TEST_CASE5(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git a/sw/riscvTests/isa/rv64uv/vse8.c b/sw/riscvTests/isa/rv64uv/vse8.c new file mode 100644 index 00000000..60d24bfd --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vse8.c @@ -0,0 +1,83 @@ +#include "vector_macros.h" + + +void TEST_CASE1(void) { + VSET(16, e8, m1); + VLOAD_8(v0, 0x11, 0x22); + volatile uint8_t ALIGNED_I8[1024]; + VLOAD_8(v1, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x11, 0x22, 0x33, + 0x44, 0x55, 0x66, 0x77, 0x88); + asm volatile("vse8.v v1, (%0)" ::"r"(ALIGNED_I8)); + VVCMP_U8(1, ALIGNED_I8, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x11, 0x22, 0x33, + 0x44, 0x55, 0x66, 0x77, 0x88); +} + +void TEST_CASE2(void) { + VSET(16, e8, m1); + volatile uint8_t ALIGNED_I8[16]={0}; + VLOAD_8(v6, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08, + 0x91, 0x02, 0x59, 0x11, 0x89); + asm volatile("vse8.v v6, (%0)" ::"r"(ALIGNED_I8)); + VCLEAR(v6); + VVCMP_U8(2, ALIGNED_I8, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, + 0xae, 0x08, 0x91, 0x02, 0x59, 0x11, 0x89); +} + +//*******Checking functionality of vse8 with different values of masking +// register******// +void TEST_CASE3(void) { + VSET(16, e8, m1); + volatile uint8_t ALIGNED_I8[16]; + VLOAD_8(v0, 0xFF, 0xFF); + VLOAD_8(v3, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08, + 0x91, 0x02, 0x59, 0x11, 0x89); +// VLOAD_8(v0, 0xFF, 0xFF); + asm volatile("vse8.v v3, (%0), v0.t" 
diff --git a/sw/riscvTests/isa/rv64uv/vse8.c b/sw/riscvTests/isa/rv64uv/vse8.c
new file mode 100644
index 00000000..60d24bfd
--- /dev/null
+++ b/sw/riscvTests/isa/rv64uv/vse8.c
@@ -0,0 +1,80 @@
+#include "vector_macros.h"
+
+
+void TEST_CASE1(void) {
+  VSET(16, e8, m1);
+  volatile uint8_t ALIGNED_I8[1024];
+  VLOAD_8(v1, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x11, 0x22, 0x33,
+          0x44, 0x55, 0x66, 0x77, 0x88);
+  asm volatile("vse8.v v1, (%0)" ::"r"(ALIGNED_I8));
+  VVCMP_U8(1, ALIGNED_I8, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x11,
+           0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88);
+}
+
+void TEST_CASE2(void) {
+  VSET(16, e8, m1);
+  volatile uint8_t ALIGNED_I8[16] = {0};
+  VLOAD_8(v6, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08,
+          0x91, 0x02, 0x59, 0x11, 0x89);
+  asm volatile("vse8.v v6, (%0)" ::"r"(ALIGNED_I8));
+  VCLEAR(v6);
+  VVCMP_U8(2, ALIGNED_I8, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88,
+           0xae, 0x08, 0x91, 0x02, 0x59, 0x11, 0x89);
+}
+
+//*******Checking functionality of vse8 with different values of the mask
+// register******//
+void TEST_CASE3(void) {
+  VSET(16, e8, m1);
+  volatile uint8_t ALIGNED_I8[16];
+  VLOAD_8(v0, 0xFF, 0xFF);
+  VLOAD_8(v3, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08,
+          0x91, 0x02, 0x59, 0x11, 0x89);
+  asm volatile("vse8.v v3, (%0), v0.t" ::"r"(ALIGNED_I8));
+  VCLEAR(v3);
+  VVCMP_U8(3, ALIGNED_I8, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88,
+           0xae, 0x08, 0x91, 0x02, 0x59, 0x11, 0x89);
+}
+
+void TEST_CASE4(void) {
+  VSET(16, e8, m1);
+  volatile uint8_t ALIGNED_I8[16];
+  VLOAD_8(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+  asm volatile("vse8.v v3, (%0)" ::"r"(ALIGNED_I8));
+  VCLEAR(v3);
+  VLOAD_8(v3, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08,
+          0x91, 0x02, 0x59, 0x11, 0x89);
+  VLOAD_8(v0, 0x00, 0x00);
+  asm volatile("vse8.v v3, (%0), v0.t" ::"r"(ALIGNED_I8));
+  VCLEAR(v3);
+  VVCMP_U8(4, ALIGNED_I8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+           16);
+}
+
+void TEST_CASE5(void) {
+  VSET(16, e8, m1);
+  volatile uint8_t ALIGNED_I8[16];
+  VLOAD_8(v3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+  asm volatile("vse8.v v3, (%0)" ::"r"(ALIGNED_I8));
+  VCLEAR(v3);
+  VLOAD_8(v3, 0xe0, 0xd3, 0x40, 0xd1, 0x84, 0x48, 0x89, 0x88, 0x88, 0xae, 0x08,
+          0x91, 0x02, 0x59, 0x11, 0x89);
+  VLOAD_8(v0, 0xAA, 0xAA);
+  asm volatile("vse8.v v3, (%0), v0.t" ::"r"(ALIGNED_I8));
+  VCLEAR(v3);
+  VVCMP_U8(5, ALIGNED_I8, 1, 0xd3, 3, 0xd1, 5, 0x48, 7, 0x88, 9, 0xae, 11, 0x91,
+           13, 0x59, 15, 0x89);
+}
+
+int main(void) {
+  INIT_CHECK();
+  enable_vec();
+
+  TEST_CASE1();
+  TEST_CASE2();
+  TEST_CASE3();
+  TEST_CASE4();
+  TEST_CASE5();
+
+  EXIT_CHECK();
+}
\ No newline at end of file
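The same 0xAA mask recurs in every masked case, so the bit ordering is worth spelling out once. A small self-check, assuming a plain-C mirror of how VLOAD_8 fills v0 byte by byte (hypothetical helper, not part of the harness):

    #include <assert.h>
    #include <stdint.h>

    /* Mask bits are numbered by element index, LSB-first inside each byte:
     * 0xAA = 0b10101010 makes elements 1, 3, 5, ... active. */
    static void check_0xaa_pattern(void) {
      const uint8_t mask[2] = {0xAA, 0xAA}; /* as in VLOAD_8(v0, 0xAA, 0xAA) */
      for (int i = 0; i < 16; ++i)
        assert(((mask[i / 8] >> (i % 8)) & 1) == (i & 1)); /* active iff odd */
    }

This is why TEST_CASE5 above expects the pre-stored values 1, 3, 5, ... to survive at even indices and the new bytes to appear at odd ones.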
diff --git a/sw/riscvTests/isa/rv64uv/vslide1down.c b/sw/riscvTests/isa/rv64uv/vslide1down.c
index 91b5b38a..71589a15 100644
--- a/sw/riscvTests/isa/rv64uv/vslide1down.c
+++ b/sw/riscvTests/isa/rv64uv/vslide1down.c
@@ -52,11 +52,8 @@ void TEST_CASE2() {
   uint64_t scalar = 99;
-  VSET(32, e8, m8);
-  VLOAD_8(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e8, m8);
   VLOAD_8(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
   VLOAD_8(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
   asm volatile("vslide1down.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar));
@@ -100,7 +100,7 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
 
   EXIT_CHECK();
 }
diff --git a/sw/riscvTests/isa/rv64uv/vslide1up.c b/sw/riscvTests/isa/rv64uv/vslide1up.c
index ffe48221..b0ddc882 100644
--- a/sw/riscvTests/isa/rv64uv/vslide1up.c
+++ b/sw/riscvTests/isa/rv64uv/vslide1up.c
@@ -76,7 +76,7 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
 
   EXIT_CHECK();
 }
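TEST_CASE2, re-enabled above, combines a slide with masking. As a reading aid, a C sketch of what masked vslide1down.vx computes (illustrative; tail-policy details are ignored and ref_vslide1down is not a harness function):

    #include <stddef.h>
    #include <stdint.h>

    /* Reference model of "vslide1down.vx vd, vs2, rs1, v0.t": each active
     * element takes its upper neighbour, and the last element takes the
     * scalar. Inactive elements keep the old destination value, which is
     * the -1 fill loaded into v8 by the test. */
    static void ref_vslide1down(uint8_t *vd, const uint8_t *vs2, uint8_t rs1,
                                const uint8_t *v0, size_t vl) {
      for (size_t i = 0; i < vl; ++i)
        if ((v0[i / 8] >> (i % 8)) & 1)
          vd[i] = (i == vl - 1) ? rs1 : vs2[i + 1];
    }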
diff --git a/sw/riscvTests/isa/rv64uv/vslidedown.c b/sw/riscvTests/isa/rv64uv/vslidedown.c
index 092e8ea3..51b94587 100644
--- a/sw/riscvTests/isa/rv64uv/vslidedown.c
+++ b/sw/riscvTests/isa/rv64uv/vslidedown.c
@@ -45,37 +45,37 @@ void TEST_CASE1() {
 
 void TEST_CASE2() {
   VSET(32, e8, m8);
-  VLOAD_8(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+  VLOAD_8(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e8, m8);
   VLOAD_8(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
-  asm volatile("vslidedown.vi v8, v0, 3, v0.t");
+  asm volatile("vslidedown.vi v8, v16, 3, v0.t");
   VCMP_U8(5, v8, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19);
 
   VSET(32, e16, m8);
-  VLOAD_16(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_16(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e16, m8);
   VLOAD_16(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-  asm volatile("vslidedown.vi v8, v0, 4, v0.t");
+  asm volatile("vslidedown.vi v8, v16, 4, v0.t");
   VCMP_U16(6, v8, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1, 16, -1, 18, -1, 20);
 
   VSET(32, e32, m8);
-  VLOAD_32(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_32(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e32, m8);
   VLOAD_32(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-  asm volatile("vslidedown.vi v8, v0, 5, v0.t");
+  asm volatile("vslidedown.vi v8, v16, 5, v0.t");
   VCMP_U32(7, v8, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19, -1, 21);
 
 #if ELEN == 64
   VSET(32, e64, m8);
-  VLOAD_64(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_64(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e64, m8);
   VLOAD_64(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-  asm volatile("vslidedown.vi v8, v0, 6, v0.t");
+  asm volatile("vslidedown.vi v8, v16, 6, v0.t");
   VCMP_U64(8, v8, -1, 8, -1, 10, -1, 12, -1, 14, -1, 16, -1, 18, -1, 20, -1, 22);
 #endif
@@ -123,40 +123,40 @@ void TEST_CASE4() {
   uint64_t scalar = 3;
 
   VSET(32, e8, m8);
-  VLOAD_8(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+  VLOAD_8(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e8, m8);
   VLOAD_8(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
-  asm volatile("vslidedown.vx v8, v0, %[A], v0.t" ::[A] "r"(scalar));
+  asm volatile("vslidedown.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar));
   VCMP_U8(13, v8, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19);
 
   VSET(32, e16, m8);
-  VLOAD_16(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_16(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e16, m8);
   VLOAD_16(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
-  asm volatile("vslidedown.vx v8, v0, %[A], v0.t" ::[A] "r"(scalar));
+  asm volatile("vslidedown.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar));
   VCMP_U16(14, v8, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19);
 
   VSET(32, e32, m8);
-  VLOAD_32(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_32(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
           19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e32, m8);
   VLOAD_32(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
-  asm volatile("vslidedown.vx v8, v0, %[A], v0.t" ::[A] "r"(scalar));
+  asm volatile("vslidedown.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar));
   VCMP_U32(15, v8, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19);
 
 #if ELEN == 64
   VSET(32, e64, m8);
-  VLOAD_64(v0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  VLOAD_64(v16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
           19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
   VSET(16, e64, m8);
   VLOAD_64(v8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
   VLOAD_8(v0, 0xAA, 0xAA);
-  asm volatile("vslidedown.vx v8, v0, %[A], v0.t" ::[A] "r"(scalar));
+  asm volatile("vslidedown.vx v8, v16, %[A], v0.t" ::[A] "r"(scalar));
   VCMP_U64(16, v8, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1, 17, -1, 19);
 #endif
 }
@@ -166,9 +166,9 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
 
   EXIT_CHECK();
 }
diff --git a/sw/riscvTests/isa/rv64uv/vslideup.c b/sw/riscvTests/isa/rv64uv/vslideup.c
index e37dff71..d81b90f5 100644
--- a/sw/riscvTests/isa/rv64uv/vslideup.c
+++ b/sw/riscvTests/isa/rv64uv/vslideup.c
@@ -133,9 +133,9 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
 
   EXIT_CHECK();
 }
diff --git a/sw/riscvTests/isa/rv64uv/vsra.c b/sw/riscvTests/isa/rv64uv/vsra.c
index 4901555f..9d8a9cfb 100644
--- a/sw/riscvTests/isa/rv64uv/vsra.c
+++ b/sw/riscvTests/isa/rv64uv/vsra.c
@@ -318,11 +318,11 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
   TEST_CASE5();
-  // TEST_CASE6();
+  TEST_CASE6();
 
   EXIT_CHECK();
 }
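The v0-to-v16 renames in the vslidedown hunks above are the substance of the fix: with v0.t masking, v0 is architecturally the mask register, so the old code loaded the slide source into v0 and then clobbered it with the 0xAA mask load. Keeping the data in v16 removes the conflict. The expected vectors follow this model (a sketch; ref_vslidedown is an illustrative name and out-of-range handling is simplified):

    #include <stddef.h>
    #include <stdint.h>

    /* Reference model of masked vslidedown: active element i reads
     * vs2[i + offset]. With offset 3, mask 0xAA and src = 1..32, the odd
     * elements become 5, 7, 9, ... while even ones keep the -1 fill,
     * matching the VCMP_U8(13, ...) row above. */
    static void ref_vslidedown(uint8_t *vd, const uint8_t *vs2, size_t offset,
                               const uint8_t *v0, size_t vl) {
      for (size_t i = 0; i < vl; ++i)
        if ((v0[i / 8] >> (i % 8)) & 1)
          vd[i] = vs2[i + offset]; /* assumes i + offset is within the source */
    }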
diff --git a/sw/riscvTests/isa/rv64uv/vss.c b/sw/riscvTests/isa/rv64uv/vss.c
new file mode 100644
index 00000000..14f0e355
--- /dev/null
+++ b/sw/riscvTests/isa/rv64uv/vss.c
@@ -0,0 +1,141 @@
+#include "vector_macros.h"
+
+// Positive-stride tests
+void TEST_CASE1(void) {
+  VSET(4, e8, m1);
+  volatile uint8_t OUT1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  uint64_t stride = 3;
+  VLOAD_8(v1, 0x9f, 0xe4, 0x19, 0x20);
+  asm volatile("vsse8.v v1, (%0), %1" ::"r"(OUT1), "r"(stride));
+  VVCMP_U8(1, OUT1, 0x9f, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x19, 0x00, 0x00, 0x20,
+           0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+}
+
+void TEST_CASE2(void) {
+  VSET(8, e16, m1);
+  volatile uint16_t OUT1[] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                              0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                              0x0000, 0x0000, 0x0000, 0x0000};
+  uint64_t stride = 4;
+  VLOAD_16(v1, 0x9f11, 0xe478, 0x1549, 0x3240, 0x2f11, 0xe448, 0x1546, 0x3220);
+  asm volatile("vsse16.v v1, (%0), %1" ::"r"(OUT1), "r"(stride));
+  VVCMP_U16(2, OUT1, 0x9f11, 0x0000, 0xe478, 0x0000, 0x1549, 0x0000, 0x3240,
+            0x0000, 0x2f11, 0x0000, 0xe448, 0x0000, 0x1546, 0x0000, 0x3220,
+            0x0000);
+}
+
+void TEST_CASE3(void) {
+  VSET(4, e32, m1);
+  volatile uint32_t OUT1[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                              0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                              0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                              0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  uint64_t stride = 8;
+  VLOAD_32(v1, 0x9f872456, 0xe1356784, 0x13241139, 0x20862497);
+  asm volatile("vsse32.v v1, (%0), %1" ::"r"(OUT1), "r"(stride));
+  VVCMP_U32(3, OUT1, 0x9f872456, 0x00000000, 0xe1356784, 0x00000000,
+            0x13241139, 0x00000000, 0x20862497, 0x00000000, 0x00000000,
+            0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+            0x00000000, 0x00000000);
+}
+
+void TEST_CASE4(void) {
+  VSET(16, e64, m2);
+  volatile uint64_t OUT1[] = {
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000};
+  uint64_t stride = 16;
+  VLOAD_64(v2, 0x9f87245315434136, 0xe135578794246784, 0x1315345345241139,
+           0x2086252110062497, 0x1100229933847136, 0xaaffaaffaaffaaff,
+           0xaf87245315434136, 0xa135578794246784, 0x2315345345241139,
+           0x1086252110062497, 0x1100229933847134, 0xaaffaaffaaffaaf4,
+           0x9315345345241139, 0x9086252110062497, 0x9100229933847134,
+           0x9affaaffaaffaaf4);
+  asm volatile("vsse64.v v2, (%0), %1" ::"r"(OUT1), "r"(stride));
+  VVCMP_U64(4, OUT1, 0x9f87245315434136, 0x0000000000000000,
+            0xe135578794246784, 0x0000000000000000, 0x1315345345241139,
+            0x0000000000000000, 0x2086252110062497, 0x0000000000000000,
+            0x1100229933847136, 0x0000000000000000, 0xaaffaaffaaffaaff,
+            0x0000000000000000, 0xaf87245315434136, 0x0000000000000000,
+            0xa135578794246784, 0x0000000000000000, 0x2315345345241139,
+            0x0000000000000000, 0x1086252110062497, 0x0000000000000000,
+            0x1100229933847134, 0x0000000000000000, 0xaaffaaffaaffaaf4,
+            0x0000000000000000, 0x9315345345241139, 0x0000000000000000,
+            0x9086252110062497, 0x0000000000000000, 0x9100229933847134,
+            0x0000000000000000, 0x9affaaffaaffaaf4, 0x0000000000000000);
+}
+
+// Masked strided store
+void TEST_CASE5(void) {
+  VSET(4, e8, m1);
+  volatile uint8_t OUT1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  uint64_t stride = 3;
+  // vl = 4, so only the low four mask bits matter: 0xA = 0b1010
+  VLOAD_8(v0, 0xA);
+  VLOAD_8(v1, 0x9f, 0xe4, 0x19, 0x20);
+  asm volatile("vsse8.v v1, (%0), %1, v0.t" ::"r"(OUT1), "r"(stride));
+  VVCMP_U8(5, OUT1, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
+           0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+}
+
+void TEST_CASE6(void) {
+  VSET(16, e64, m2);
+  volatile uint64_t OUT1[] = {
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+      0x0000000000000000, 0x0000000000000000};
+  uint64_t stride = 16;
+  VLOAD_64(v2, 0x9f87245315434136, 0xe135578794246784, 0x1315345345241139,
+           0x2086252110062497, 0x1100229933847136, 0xaaffaaffaaffaaff,
+           0xaf87245315434136, 0xa135578794246784, 0x2315345345241139,
+           0x1086252110062497, 0x1100229933847134, 0xaaffaaffaaffaaf4,
+           0x9315345345241139, 0x9086252110062497, 0x9100229933847134,
+           0x9affaaffaaffaaf4);
+  VLOAD_8(v0, 0xAA, 0xAA);
+  asm volatile("vsse64.v v2, (%0), %1, v0.t" ::"r"(OUT1), "r"(stride));
+  VVCMP_U64(6, OUT1, 0x0000000000000000, 0x0000000000000000,
+            0xe135578794246784, 0x0000000000000000, 0x0000000000000000,
+            0x0000000000000000, 0x2086252110062497, 0x0000000000000000,
+            0x0000000000000000, 0x0000000000000000, 0xaaffaaffaaffaaff,
+            0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+            0xa135578794246784, 0x0000000000000000, 0x0000000000000000,
+            0x0000000000000000, 0x1086252110062497, 0x0000000000000000,
+            0x0000000000000000, 0x0000000000000000, 0xaaffaaffaaffaaf4,
+            0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+            0x9086252110062497, 0x0000000000000000, 0x0000000000000000,
+            0x0000000000000000, 0x9affaaffaaffaaf4, 0x0000000000000000);
+}
+
+int main(void) {
+  INIT_CHECK();
+  enable_vec();
+
+  TEST_CASE1();
+  TEST_CASE2();
+  TEST_CASE3();
+  TEST_CASE4();
+
+  TEST_CASE5();
+  TEST_CASE6();
+
+  EXIT_CHECK();
+}
\ No newline at end of file
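vss.c exercises vsse8/16/32/64, whose only difference from the unit-stride stores is the byte stride taken from rs2. A sketch of the addressing the expected arrays encode (ref_vsse8 is a hypothetical helper; element type uint8_t corresponds to vsse8):

    #include <stddef.h>
    #include <stdint.h>

    /* Reference model of "vsse8.v vs3, (rs1), rs2": element i is stored at
     * base + i * stride bytes. TEST_CASE1 (stride 3) thus writes offsets
     * 0, 3, 6, 9 and leaves the gap bytes at their initial zero. */
    static void ref_vsse8(uint8_t *base, uint64_t stride, const uint8_t *vs3,
                          size_t vl) {
      for (size_t i = 0; i < vl; ++i)
        base[i * stride] = vs3[i];
    }

The 64-bit cases use stride 16 with 8-byte elements, so consecutive elements land at every other array slot, which is why the references interleave data with zeros.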
diff --git a/sw/riscvTests/isa/rv64uv/vsub.c b/sw/riscvTests/isa/rv64uv/vsub.c
index 177910fd..500518ad 100644
--- a/sw/riscvTests/isa/rv64uv/vsub.c
+++ b/sw/riscvTests/isa/rv64uv/vsub.c
@@ -136,9 +136,9 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
 
   EXIT_CHECK();
 }
diff --git a/sw/riscvTests/isa/rv64uv/vwadd.c b/sw/riscvTests/isa/rv64uv/vwadd.c
index d98e1bd4..da623bfb 100644
--- a/sw/riscvTests/isa/rv64uv/vwadd.c
+++ b/sw/riscvTests/isa/rv64uv/vwadd.c
@@ -217,9 +217,8 @@ int main(void) {
 
-  // SKIP 2,4: masking not supported
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
   // SKIP 5-8: vwadd.wv and vwadd.wx not supported
   // TEST_CASE5();
   // TEST_CASE6();
diff --git a/sw/riscvTests/isa/rv64uv/vwmacc.c b/sw/riscvTests/isa/rv64uv/vwmacc.c
index d1118af8..2e78bd6c 100644
--- a/sw/riscvTests/isa/rv64uv/vwmacc.c
+++ b/sw/riscvTests/isa/rv64uv/vwmacc.c
@@ -248,9 +248,9 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
 
   EXIT_CHECK();
 }
diff --git a/sw/riscvTests/isa/rv64uv/vxor.c b/sw/riscvTests/isa/rv64uv/vxor.c
index 6e4bb684..c7b41f02 100644
--- a/sw/riscvTests/isa/rv64uv/vxor.c
+++ b/sw/riscvTests/isa/rv64uv/vxor.c
@@ -311,11 +311,11 @@ int main(void) {
   enable_vec();
 
   TEST_CASE1();
-  // TEST_CASE2();
+  TEST_CASE2();
   TEST_CASE3();
-  // TEST_CASE4();
+  TEST_CASE4();
   TEST_CASE5();
-  // TEST_CASE6();
+  TEST_CASE6();
 
   EXIT_CHECK();
 }
diff --git a/util/Makefrag b/util/Makefrag
index 3eaffd5d..cb940baf 100644
--- a/util/Makefrag
+++ b/util/Makefrag
@@ -24,7 +24,7 @@ VERILATOR_INSTALL_DIR ?= ${INSTALL_DIR}/verilator
 BENDER ?= ${BENDER_INSTALL_DIR}/bender
 DASM ?= ${SPIKE_INSTALL_DIR}/bin/spike-dasm
 VLT ?= ${VERILATOR_INSTALL_DIR}/bin/verilator_bin
-CMAKE ?= cmake-3.18.1
+CMAKE ?= cmake # CMY: use the system cmake instead of the pinned cmake-3.18.1
 PYTHON ?= python3.6
 CC ?= gcc-11.2.0
 CXX ?= g++-11.2.0