All major combination functionality (support for every row-/column-major layout combination of A and B in the PVC FP8 GEMM example)

jiyang1011 · jiyang1011 · commit eb3529ebdc64 · 2025-04-15T00:47:49.000-07:00
diff --git a/examples/sycl/pvc/pvc_gemm_fp8.cpp b/examples/sycl/pvc/pvc_gemm_fp8.cpp
@@ -105,7 +105,8 @@
  };
  
  ///////////////////////////////////////////////////////////////////////////////////////////////////
- 
+#define A_ROW  // Operand-A layout switch: A_ROW -> LayoutA = RowMajor, A_COL -> ColumnMajor. Define exactly one.
+#define B_COL  // Operand-B layout switch: B_ROW -> LayoutB = RowMajor, B_COL -> ColumnMajor. Define exactly one.
  template <
    class Gemm
  >
@@ -159,19 +160,33 @@
    //
  
    bool verify(const Options &options) {
- 
-     using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
-     using GmemTiledCopyB = XE_2D_U16x32x32_LD_N;
- 
-     // Workgroup-level tile
-     using TileShape = Shape<_256, _256, _32>;
- 
-     using TiledMma =
-       TiledMMA<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
-                Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>,
-                Tile<Layout<Shape<_8, _8, _4>, Stride<_1, _32, _8>>,
-                     Layout<Shape<_16, _4, _4>, Stride<_1, _64, _16>>, _32>>;
- 
+    #if defined(A_ROW) && defined(B_COL)
+    using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+    using GmemTiledCopyB = XE_2D_U16x16x16_LD_T;
+    #endif
+    #if defined(A_ROW) && defined(B_ROW)
+    using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
+    using GmemTiledCopyB = XE_2D_U16x32x32_LD_N;
+    #endif
+
+    #if defined(A_COL) && defined(B_ROW)
+    using GmemTiledCopyA = XE_2D_U16x16x16_LD_T;
+    using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
+    #endif
+
+    #if defined(A_COL) && defined(B_COL)
+    using GmemTiledCopyA = XE_2D_U16x16x16_LD_T;
+    using GmemTiledCopyB = XE_2D_U16x16x16_LD_T;
+    #endif
+    // Workgroup-level tile
+    using TileShape = Shape<_256, _256, _32>;
+  
+    using MMAAtom = MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>;
+    using TiledMma = TiledMMA<MMAAtom,
+                              Layout<Shape<_8,_4,_1>, Stride<_4,_1,_0>>, 
+                              Tile<Layout<Shape<_8, _8, _4>, Stride<_1, _32, _8>>,
+                                   Layout<Shape<_16, _4, _4>, Stride<_1, _64, _16>>, 
+                                   _32>>;
      constexpr int PipelineStages = 3;
      using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVC<PipelineStages>;
      using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue;
@@ -317,29 +332,47 @@
  
  };
  
-
  struct TransformA {
   template <class RTensor, class Trait, class TransTensor>
   CUTE_HOST_DEVICE auto operator()(RTensor const& in, Trait trait, TransTensor& out) {
+  #if defined(A_ROW)
     // auto mma_A = make_fragment_like<typename TiledMma::ValTypeA>(in);
     Layout A_selector = make_layout(make_shape(_8{}, _4{}, _2{}),  make_stride(_2{},_16{},_1{}));
+    // Layout A_selector = make_layout(make_shape(_8{}, _1{}, _2{}), make_stride(_2{},_16{}, _1{}));
+    // Layout A_selector = make_layout(make_shape(_8{}, _2{}, _2{}), make_stride(_2{}, _16{}, _1{}));
     CUTLASS_PRAGMA_UNROLL
     for(int i = 0; i < size<1>(out); i++) {
       CUTLASS_PRAGMA_UNROLL
       for(int j =0; j < size<2>(out); j++) {
         CUTLASS_PRAGMA_UNROLL
         for(int v = 0; v < size<0>(out); v++) {
           out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeA*/>(in.data()[A_selector(v, i, j)]);
-          // out(v, i, j) = 1;
+          // out(v, i, j) = (bfloat16_t)(1.0f);
         }
       }
     }
+  #endif 
+  #if defined(A_COL)  // A_COL variant: same copy loop as the A_ROW variant, only the selector strides differ
+  Layout A_selector = make_layout(make_shape(_8{},_4{},_2{}), make_stride(_1{},_8{},_32{}));  // maps (v, i, j) to an element offset into in.data()
+  CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < size<1>(out); i++) {
+      CUTLASS_PRAGMA_UNROLL
+      for(int j =0; j < size<2>(out); j++) {
+        CUTLASS_PRAGMA_UNROLL
+        for(int v = 0; v < size<0>(out); v++) {
+          out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeA*/>(in.data()[A_selector(v, i, j)]);
+          // Disabled alternative (constant fill): out(v, i, j) = (bfloat16_t)(1.0f);
+        }
+      }
+    }
+  #endif  // A_COL -- NOTE: if neither A_ROW nor A_COL is defined, operator() has an empty body and `out` is never written
   }
  };
 
  struct TransformB {
   template <class RTensor, class Trait, class TransTensor>
   CUTE_HOST_DEVICE auto operator()(RTensor const& in, Trait trait, TransTensor& out) {
+    #if defined(B_ROW) && defined(A_ROW)
     //  auto mma_B = make_fragment_like<typename TiledMma::ValTypeB>(in);
      Layout B_selector = make_layout(make_shape(_16{}, make_shape(_2{}, _2{}), _2{}), make_stride(_4{}, make_stride(_1{}, _64{}) ,_2{}));
      CUTLASS_PRAGMA_UNROLL
@@ -349,10 +382,53 @@
          CUTLASS_PRAGMA_UNROLL
          for(int v = 0; v < size<0>(out); v++) {
           out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeB*/>(in.data()[B_selector(v, i, j)]);
-          //  out(v, i, j) = 1;
+          //  out(v, i, j) = (bfloat16_t)(1.0f);
+         }
+       }
+     }
+     #endif
+     #if defined(B_ROW) && defined(A_COL)  // row-major B with column-major A; NOTE(review): the B selector is keyed on A's layout macro as well -- confirm why
+     Layout B_selector = make_layout(make_shape(_16{}, make_shape(_2{}, _2{}), _2{}), make_stride(_2{}, make_stride(_1{}, _64{}) ,_32{}));  // maps (v, i, j) to an element offset into in.data()
+     CUTLASS_PRAGMA_UNROLL
+     for(int i = 0; i < size<1>(out); i++) {
+       CUTLASS_PRAGMA_UNROLL
+       for(int j =0; j < size<2>(out); j++) {
+         CUTLASS_PRAGMA_UNROLL
+         for(int v = 0; v < size<0>(out); v++) {
+          out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeB*/>(in.data()[B_selector(v, i, j)]);
+          //  Disabled alternative (constant fill): out(v, i, j) = (bfloat16_t)(1.0f);
+         }
+       }
+     }
+     #endif  // B_ROW && A_COL
+     #if defined(B_COL) && defined(A_COL)  // column-major B with column-major A
+     Layout B_selector = make_layout(make_shape(_16{}, _4{},_2{}), make_stride(_1{}, _32{},_16{}));  // maps (v, i, j) to an element offset into in.data()
+     CUTLASS_PRAGMA_UNROLL
+     for(int i = 0; i < size<1>(out); i++) {
+       CUTLASS_PRAGMA_UNROLL
+       for(int j =0; j < size<2>(out); j++) {
+         CUTLASS_PRAGMA_UNROLL
+         for(int v = 0; v < size<0>(out); v++) {
+          out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeB*/>(in.data()[B_selector(v, i, j)]);
+          //  Disabled alternative (constant fill): out(v, i, j) = (bfloat16_t)(1.0f);
          }
        }
      }
+     #endif  // B_COL && A_COL
+     #if defined(B_COL) && defined(A_ROW)  // column-major B with row-major A
+     Layout B_selector = make_layout(make_shape(_16{}, _4{}, _2{}), make_stride(_2{}, _32{},_1{}));  // maps (v, i, j) to an element offset into in.data()
+     CUTLASS_PRAGMA_UNROLL
+     for(int i = 0; i < size<1>(out); i++) {
+       CUTLASS_PRAGMA_UNROLL
+       for(int j =0; j < size<2>(out); j++) {
+         CUTLASS_PRAGMA_UNROLL
+         for(int v = 0; v < size<0>(out); v++) {
+          out(v, i, j) = static_cast<cutlass::bfloat16_t/*typename TiledMma::ValTypeB*/>(in.data()[B_selector(v, i, j)]);
+          //  Disabled alternative (constant fill): out(v, i, j) = (bfloat16_t)(1.0f);
+         }
+       }
+     }
+     #endif  // B_COL && A_ROW
    }
   };
   
@@ -398,16 +474,39 @@
    using ElementInputB = cutlass::float_e4m3_t;        // <- data type of elements in input matrix B
    using ElementOutput = float;                        // <- data type of elements in output matrix D
  
-   using LayoutA = cutlass::layout::RowMajor;
-   using LayoutB = cutlass::layout::RowMajor;
    using LayoutC = cutlass::layout::RowMajor;
    using LayoutD = cutlass::layout::RowMajor;
  
    // Note: XE_2D_U8x32x32_LD_V is incompatible with our bf16 MMA atoms
+   // 2.8tflops  U8x32x32NLD_N 
+   // 1.4tflops  U8x16x32NLD_N
+   // 0.7tflops  U8x 8x32NLD_N
+   #if defined(A_COL) && defined(B_ROW)
+   using LayoutA = cutlass::layout::ColumnMajor;
+   using LayoutB = cutlass::layout::RowMajor;
+   using GmemTiledCopyA = XE_2D_U8x16x32_LD_T;
+   using GmemTiledCopyB = XE_2D_U8x32x32_LD_N;
+   #endif
+   #if defined(A_ROW) &&  defined(B_ROW)
+   using LayoutA = cutlass::layout::RowMajor;
+   using LayoutB = cutlass::layout::RowMajor;
    using GmemTiledCopyA = XE_2D_U8x32x32_LD_N;
    using GmemTiledCopyB = XE_2D_U8x32x32_LD_N;
-   static_assert(sizeof(ElementInputA) == 1, "ElementA width must match GmemTiledCopyA U8");
- 
+   #endif
+
+   #if defined(A_COL) && defined(B_COL)  // both operands column-major; logical && matches the three sibling guards
+   using LayoutA = cutlass::layout::ColumnMajor;
+   using LayoutB = cutlass::layout::ColumnMajor;
+   using GmemTiledCopyA = XE_2D_U8x16x32_LD_T;  // transposing 1-byte block load
+   using GmemTiledCopyB = XE_2D_U8x16x32_LD_T;  // same transposing load for B
+   #endif  // A_COL && B_COL
+   
+   #if defined(A_ROW) && defined(B_COL)
+   using LayoutA = cutlass::layout::RowMajor;
+   using LayoutB = cutlass::layout::ColumnMajor;
+   using GmemTiledCopyA = XE_2D_U8x32x32_LD_N;
+   using GmemTiledCopyB = XE_2D_U8x16x32_LD_T;
+   #endif
    // Workgroup-level tile
    using TileShape = Shape<_256, _256, _32>;
  
@@ -435,7 +534,11 @@
            FusionCallBacks,
            XE_2D_U32x8x16_LD_N,
            void, void,
+           #if defined(B_COL)
+           XE_2D_U32x8x16_ST_N,
+           #else
            void,
+           #endif
            void, void>;
  
    // Mainloop
diff --git a/include/cute/arch/xe_copy_2B.hpp b/include/cute/arch/xe_copy_2B.hpp
@@ -778,6 +778,27 @@ struct XE_2D_U16x16x16_LD_T {
   }
 };
 
+struct XE_2D_U8x16x32_LD_T {  // Transposing 2D block-load op for 1-byte element types (see static_assert in copy()).
+    using BlockShape = Shape<_32, _16>;
+    using inst_dtype = uint32_t;  // matches the u32 variant of the builtin called in copy()
+  // NOTE(review): this U8 op is added to xe_copy_2B.hpp per the diff header -- confirm that is the intended file.
+    static constexpr bool is_transpose = true;  // flags this op as a transposing load
+  
+    template <class T>
+    CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width,
+                                      int height, int pitch, intel::coord_t coord,
+                                      T *dst) {  // loads one block addressed by (baseoffset, coord) and writes it through dst
+  #if defined(SYCL_INTEL_TARGET)  // real load only when SYCL_INTEL_TARGET is defined
+      static_assert(sizeof(T) == 1, "Expected T to have size 1");
+      *reinterpret_cast<intel::uint8 *>(dst) =  // result is stored through dst viewed as a single intel::uint8
+          __builtin_IB_subgroup_block_read_flat_transpose_u32_k8(
+              (long)(baseoffset), width - 1, height - 1, pitch - 1, coord);  // width/height/pitch are passed minus one -- presumably what the builtin expects; TODO confirm
+  #else  // any other target: calling copy() is an invalid control path
+      CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware");
+  #endif
+    }
+  }; 
+
 struct XE_2D_U16x1x16_ST_N {
   using BlockShape = Shape<_1, _16>;
 
diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp
@@ -1667,6 +1667,26 @@ struct Copy_Traits_<XE_2D_U16x16x16_LD_T, args_t...>
       : XE_2D_LD_Unpack<XE_2D_U16x16x16_LD_T, args_t...>(args...) {}
 };
 
+template <class... args_t>
+struct Copy_Traits_<XE_2D_U8x16x32_LD_T, args_t...>  // copy traits specialization for the XE_2D_U8x16x32_LD_T load op
+    : XE_2D_LD_Unpack<XE_2D_U8x16x32_LD_T, args_t...> {
+  using ThrID = Layout<_16>;  // 16 entries -- presumably one per sub-group lane; TODO confirm
+  // Map from (src-thr,src-val) to bit
+  // TODO(joe): Not convinced that changing from <_16, _256> should be required here
+  // but get_logical_layout assumes get<1,0>(layout.shape) is the type size
+  using SrcLayout = Layout<Shape <_16,Shape <_8,_32>>,
+                           Stride< _0,Stride<_1,_64>>>;
+  // Map from (dst-thr,dst-val) to bit
+  using DstLayout = Layout<Shape < _16,Shape <_8,_32>>,
+                           Stride<_256,Stride<_1, _8>>>;
+  // Reference map from (thr,val) to bit
+  using RefLayout = DstLayout;
+
+  template <class... ArgT>  // variadic constructor: forwards every argument unchanged to the XE_2D_LD_Unpack base
+  Copy_Traits_(ArgT... args)
+      : XE_2D_LD_Unpack<XE_2D_U8x16x32_LD_T, args_t...>(args...) {}
+};
+
 // template<class... args_t>
 // struct Copy_Traits<XE_2D_U32x16x1_LD_T, args_t...>
 //     : XE_2D_LD_Unpack<XE_2D_U32x16x1_LD_T, args_t...> {
@@ -2251,6 +2271,7 @@ COPY_TRAIT_LD_DEF(XE_2D_U16x32x16_LD_V)
 COPY_TRAIT_LD_DEF(XE_2D_U16x32x32_LD_V)
 COPY_TRAIT_LD_DEF(XE_2D_U16x16x32_LD_V)
 COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_T)
+COPY_TRAIT_LD_DEF(XE_2D_U8x16x32_LD_T)
 COPY_TRAIT_LD_DEF(XE_2D_TF32x16x16_LD_N)
 COPY_TRAIT_LD_DEF(XE_2D_TF32x32x16_LD_N)
 COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N)
diff --git a/include/cutlass/epilogue/collective/xe_epilogue.hpp b/include/cutlass/epilogue/collective/xe_epilogue.hpp
@@ -378,12 +378,16 @@ class CollectiveEpilogue<
     auto synchronize = [&] () {};
     
 // 32 x 64 
+// if(cute::thread0()) {
+//  print("accumulators: ");print(accumulators);print("\n");
+// }
 if constexpr(!is_same_v<CopyOpR2G_, XE_2D_U32x8x16_ST_N>) {
 auto D = make_tensor(make_gmem_ptr(params.ptr_D), make_layout(make_shape(4096, 4096), make_stride(4096, 1)));
 for(int i = 0; i < size<1>(accumulators); i++) {
   for(int j = 0; j < size<2>(accumulators); j++) {
     for(int v = 0; v < size<0>(accumulators); v++) {
       D(v + i * 8 + m_sg * 32 + BlockIdxY() * 256 ,  BlockIdxX() * 256 + n_sg * 64 + (thread_idx % 16) * 2 + (j % 2) + (j / 2) * 32) = accumulators(v, i, j);
+      // D(v + i * 8 + m_sg * 16 + BlockIdxY() * 128 ,  BlockIdxX() * 256 + n_sg * 64 + (thread_idx % 16) * 2 + (j % 2) + (j / 2) * 32) = accumulators(v, i, j);
     }
   }
 }
diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp
@@ -197,9 +197,6 @@ struct CollectiveMma<MainloopIntelPVC<Stages, Schedule>, TileShape_, ElementA_,
     auto pAgA = thr_prefetch_A.partition_S(gA);
     auto pBgB = thr_prefetch_B.partition_S(gB);
 
-    TransformA transformA{};
-    TransformB transformB{};
-
 #if CUTLASS_ENABLE_DEBUG_PRINTS
 #define PRINT(x) print(#x ": "); print(x); print("\n");
     if (cute::thread(LOG_THREAD, LOG_GROUP)) {
@@ -228,7 +225,7 @@ struct CollectiveMma<MainloopIntelPVC<Stages, Schedule>, TileShape_, ElementA_,
     const auto k_start_idx = crd2idx((*k_tile_iter), make_shape(K_start));
     constexpr int barrier_scope = 2;
     int prefetch_k = 0;
-    
+
     CUTLASS_PRAGMA_UNROLL
     for (; prefetch_k < DispatchPolicy::Stages; prefetch_k++) {
       prefetch(tiled_prefetch_a, pAgA(_, _, _, prefetch_k));

Original file line number	Diff line number	Diff line change
`@@ -378,12 +378,16 @@ class CollectiveEpilogue<`
`378`	`378`	`auto synchronize = [&] () {};`
`379`	`379`
`380`	`380`	`// 32 x 64`
	`381`	`+// if(cute::thread0()) {`
	`382`	`+// print("accumulators: ");print(accumulators);print("\n");`
	`383`	`+// }`
`381`	`384`	`if constexpr(!is_same_v<CopyOpR2G_, XE_2D_U32x8x16_ST_N>) {`
`382`	`385`	`auto D = make_tensor(make_gmem_ptr(params.ptr_D), make_layout(make_shape(4096, 4096), make_stride(4096, 1)));`
`383`	`386`	`for(int i = 0; i < size<1>(accumulators); i++) {`
`384`	`387`	`for(int j = 0; j < size<2>(accumulators); j++) {`
`385`	`388`	`for(int v = 0; v < size<0>(accumulators); v++) {`
`386`	`389`	`D(v + i * 8 + m_sg * 32 + BlockIdxY() * 256 , BlockIdxX() * 256 + n_sg * 64 + (thread_idx % 16) * 2 + (j % 2) + (j / 2) * 32) = accumulators(v, i, j);`
	`390`	`+ // D(v + i * 8 + m_sg * 16 + BlockIdxY() * 128 , BlockIdxX() * 256 + n_sg * 64 + (thread_idx % 16) * 2 + (j % 2) + (j / 2) * 32) = accumulators(v, i, j);`
`387`	`391`	`}`
`388`	`392`	`}`
`389`	`393`	`}`