Commit 5e59143
update
1 parent 7218517 commit 5e59143

10 files changed: +504 -148 lines changed

examples/atom/ldgmem_ldsmem_v0.cu

+96
@@ -0,0 +1,96 @@
#include "common.h"
// copy async: one bulk H2D copy, then a global -> shared -> global kernel

// nvcc -arch=sm_90a -std=c++17 -I ../../include/ -lcuda ldgmem_ldsmem_v0.cu -o test

const int SM_LODA_BYTES = 128/8;

template <typename DType, int BLOCKM, int BLOCKN, int NUM_THREADS>
__global__ void naive_matrix_ldsm(DType* source, int M, int N, DType* dummy_out) {
  __shared__ DType smem[BLOCKM*BLOCKN];
  // each thread moves one 128-bit (int4) vector per row repeat
  const int VEC_LEN = SM_LODA_BYTES / sizeof(DType);
  const int VEC_REPEAT = BLOCKN / VEC_LEN;
  const int THREAD_N = VEC_REPEAT;
  const int THREAD_M = NUM_THREADS / THREAD_N;
  const int ROW_REPEAT = BLOCKM / THREAD_M;
  static_assert(BLOCKN % VEC_LEN == 0);
  static_assert(NUM_THREADS % THREAD_N == 0);
  static_assert(ROW_REPEAT * THREAD_M == BLOCKM);

  int mo = blockIdx.x * BLOCKM;
  int mi = threadIdx.x / THREAD_N;
  int ni = threadIdx.x % THREAD_N;
  int4* ld_source = reinterpret_cast<int4*>(source);
  int4* ld_smem = reinterpret_cast<int4*>(smem);
  for (int no = 0; no < N; no += BLOCKN) {
    // global -> shared, vectorized as int4
    for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
      int m = mo + row_repeat * THREAD_M + mi;
      int n = no + ni * VEC_LEN;
      int idx = m * N + n;
      int sm = row_repeat * THREAD_M + mi;
      int sn = ni * VEC_LEN;
      int sm_idx = sm * BLOCKN + sn;
      ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
    }
    __syncthreads();
    // shared -> global, with a dummy +1 so the traffic is not optimized away
    for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
      int m = mo + row_repeat * THREAD_M + mi;
      int n = no + ni * VEC_LEN;
      int idx = m * N + n;
      int sm = row_repeat * THREAD_M + mi;
      int sn = ni * VEC_LEN;
      int sm_idx = sm * BLOCKN + sn;
      for (int i = 0; i < VEC_LEN; ++i) {
        dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
      }
    }
  }
}


template<typename DType>
void cpu_dummy(DType* source, DType* dummy_out, int M, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      dummy_out[m * N + n] = (DType)((float)source[m * N + n] + (float)DType(1));
    }
  }
}


int main(int argc, char** argv) {
  const int M = 1024;
  const int N = 1024;
  using DType = half;
  const int BLOCKM = 128;
  const int BLOCKN = 128;
  const int NUM_THREADS = 128;
  std::vector<int> shape{M, N};
  auto A = alloc_cpu_tensor<DType>(shape);
  random_fill(A, shape);
  auto B = alloc_cpu_tensor<DType>(shape);
  auto golden = alloc_cpu_tensor<DType>(shape);

  GPUTimer gpu_timer;

  auto dA = alloc_gpu_tensor<DType>(shape);
  auto dB = alloc_gpu_tensor<DType>(shape);
  gpu_timer.sync_all();
  gpu_timer.tick();
  copy_to_gpu_async(A, dA, shape);
  dim3 block(NUM_THREADS);
  dim3 grid(ceil_div(M, BLOCKM));
  naive_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS><<<grid, block>>>(dA, M, N, dB);
  copy_to_cpu_async(B, dB, shape);
  gpu_timer.tick();
  gpu_timer.sync_all();
  std::cout << "GPU naive done! Use " << gpu_timer.report_last_ms() << " ms.\n";

  std::cout << "Calculating golden...\n";
  cpu_dummy(A, golden, M, N);
  assert_allclose(B, golden, shape, 1e-5, /*dump=*/false);
  std::cout << "Correct!\n";

  return 0;
}
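For reference, the tiling constants that naive_matrix_ldsm derives at compile time resolve as follows for the half-precision launch in main (BLOCKM = BLOCKN = NUM_THREADS = 128). The snippet below is a standalone host-side sketch added here for illustration; it is not part of the commit:

// Standalone sketch (illustration only): recompute the per-thread tiling that
// naive_matrix_ldsm uses for DType = half, BLOCKM = BLOCKN = 128, NUM_THREADS = 128.
#include <cstdio>

int main() {
  const int LOAD_BYTES = 128 / 8;               // one 128-bit (int4) vector per load
  const int SIZEOF_HALF = 2;
  const int BLOCKM = 128, BLOCKN = 128, NUM_THREADS = 128;

  const int VEC_LEN = LOAD_BYTES / SIZEOF_HALF; // 8 half elements per int4
  const int THREAD_N = BLOCKN / VEC_LEN;        // 16 threads span one tile row
  const int THREAD_M = NUM_THREADS / THREAD_N;  // 8 tile rows covered per pass
  const int ROW_REPEAT = BLOCKM / THREAD_M;     // 16 passes cover all 128 rows

  std::printf("VEC_LEN=%d THREAD_N=%d THREAD_M=%d ROW_REPEAT=%d\n",
              VEC_LEN, THREAD_N, THREAD_M, ROW_REPEAT);
  // Each thread therefore moves ROW_REPEAT = 16 int4 vectors (256 B) of the
  // 128 x 128 x 2 B = 32 KiB tile per BLOCKN step of the no loop.
  return 0;
}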

examples/atom/ldgmem_ldsmem_v1.cu

+108
@@ -0,0 +1,108 @@
#include "common.h"
// split copy async: split the H2D copy into chunks and launch one kernel per chunk

// nvcc -arch=sm_90a -std=c++17 -I ../../include/ -lcuda ldgmem_ldsmem_v1.cu -o test

const int SM_LODA_BYTES = 128/8;

template <typename DType, int BLOCKM, int BLOCKN, int NUM_THREADS>
__global__ void split_matrix_ldsm(DType* source, int M, int N, DType* dummy_out, int split, int curr_split) {
  __shared__ DType smem[BLOCKM*BLOCKN];
  const int VEC_LEN = SM_LODA_BYTES / sizeof(DType);
  const int VEC_REPEAT = BLOCKN / VEC_LEN;
  const int THREAD_N = VEC_REPEAT;
  const int THREAD_M = NUM_THREADS / THREAD_N;
  const int ROW_REPEAT = BLOCKM / THREAD_M;
  static_assert(BLOCKN % VEC_LEN == 0);
  static_assert(NUM_THREADS % THREAD_N == 0);
  static_assert(ROW_REPEAT * THREAD_M == BLOCKM);

  // each launch handles a chunk of M/split rows; offset the output to its chunk
  dummy_out += M / split * curr_split * N;

  int mo = blockIdx.x * BLOCKM;
  int mi = threadIdx.x / THREAD_N;
  int ni = threadIdx.x % THREAD_N;
  int4* ld_source = reinterpret_cast<int4*>(source);
  int4* ld_smem = reinterpret_cast<int4*>(smem);
  for (int no = 0; no < N; no += BLOCKN) {
    for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
      int m = mo + row_repeat * THREAD_M + mi;
      int n = no + ni * VEC_LEN;
      int idx = m * N + n;
      int sm = row_repeat * THREAD_M + mi;
      int sn = ni * VEC_LEN;
      int sm_idx = sm * BLOCKN + sn;
      ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
    }
    __syncthreads();
    for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
      int m = mo + row_repeat * THREAD_M + mi;
      int n = no + ni * VEC_LEN;
      int idx = m * N + n;
      int sm = row_repeat * THREAD_M + mi;
      int sn = ni * VEC_LEN;
      int sm_idx = sm * BLOCKN + sn;
      for (int i = 0; i < VEC_LEN; ++i) {
        dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
      }
    }
  }
}


template<typename DType>
void cpu_dummy(DType* source, DType* dummy_out, int M, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      dummy_out[m * N + n] = (DType)((float)source[m * N + n] + (float)DType(1));
    }
  }
}


int main(int argc, char** argv) {
  const int M = 1024;
  const int N = 1024;
  int split = 4;
  using DType = half;
  const int BLOCKM = 128;
  const int BLOCKN = 128;
  const int NUM_THREADS = 128;
  std::vector<int> shape{M, N};
  std::vector<int> epoch_shape{M/split, N};
  auto A = alloc_cpu_tensor<DType>(shape);
  random_fill(A, shape);
  // constant_fill(A, shape, DType(1));
  auto B = alloc_cpu_tensor<DType>(shape);
  auto golden = alloc_cpu_tensor<DType>(shape);

  GPUTimer gpu_timer;

  std::vector<DType*> dAs;
  for (int i = 0; i < split; ++i) {
    dAs.push_back(alloc_gpu_tensor<DType>(epoch_shape));
  }
  auto dB = alloc_gpu_tensor<DType>(shape);

  dim3 block(NUM_THREADS);
  dim3 grid(ceil_div(M/split, BLOCKM));
  gpu_timer.sync_all();
  gpu_timer.tick();
  for (int i = 0; i < split; ++i) {
    // copy chunk i, then launch on it; everything here uses the default stream
    copy_to_gpu_async(A + M/split * i * N, dAs[i], epoch_shape);
    split_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS><<<grid, block>>>(dAs[i], M, N, dB, split, i);
  }
  gpu_timer.tick();
  gpu_timer.sync_all();
  std::cout << "GPU split done! Use " << gpu_timer.report_last_ms() << " ms.\n";
  copy_to_cpu_async(B, dB, shape);

  std::cout << "Calculating golden...\n";
  cpu_dummy(A, golden, M, N);
  assert_allclose(B, golden, shape, 1e-5, /*dump=*/false);
  std::cout << "Correct!\n";

  return 0;
}
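In main above, each chunk's copy_to_gpu_async and kernel launch go to the default stream, so they still execute strictly back-to-back; the split mainly shortens the first copy the first kernel has to wait for. Below is a minimal sketch of the next step, giving each split its own stream so chunk i+1's H2D copy can overlap chunk i's kernel. This is not part of the commit, and it assumes A sits in pinned host memory (cudaMallocHost / cudaHostRegister); with pageable memory, cudaMemcpyAsync behaves synchronously and nothing overlaps.

// Sketch only (not in the commit): one stream per split to overlap copy and compute.
std::vector<cudaStream_t> streams(split);
for (int i = 0; i < split; ++i) {
  CUDA_CHECK(cudaStreamCreate(&streams[i]));
}
gpu_timer.sync_all();
gpu_timer.tick();
for (int i = 0; i < split; ++i) {
  // chunk i's kernel is ordered after chunk i's copy by stream i,
  // while chunk i+1's copy can start concurrently on its own stream
  copy_to_gpu_async(A + M / split * i * N, dAs[i], epoch_shape, streams[i]);
  split_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS>
      <<<grid, block, 0, streams[i]>>>(dAs[i], M, N, dB, split, i);
}
gpu_timer.tick();
gpu_timer.sync_all();
for (int i = 0; i < split; ++i) {
  CUDA_CHECK(cudaStreamDestroy(streams[i]));
}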

include/common.h

+29
@@ -268,6 +268,15 @@ void random_fill(DType* tensor, std::vector<int> shape) {
   }
 }
 
+template <class DType>
+void constant_fill(DType* tensor, std::vector<int> shape, DType value) {
+  int length =
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
+  for (int i = 0; i < length; ++i) {
+    tensor[i] = value;
+  }
+}
+
 template <class DType>
 DType* alloc_gpu_tensor(std::vector<int> shape) {
   DType* dt;
@@ -297,6 +306,16 @@ void copy_to_gpu(DType* hptr, DType* dptr, std::vector<int> shape) {
       cudaMemcpyHostToDevice));
 }
 
+template <class DType>
+void copy_to_gpu_async(DType* hptr, DType* dptr, std::vector<int> shape,
+                       cudaStream_t stream = 0) {
+  CUDA_CHECK(cudaMemcpyAsync(
+      dptr, hptr,
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
+          sizeof(DType),
+      cudaMemcpyHostToDevice, stream));
+}
+
 template <class DType>
 void copy_to_cpu(DType* hptr, DType* dptr, std::vector<int> shape) {
   CUDA_CHECK(cudaMemcpy(
@@ -306,6 +325,16 @@ void copy_to_cpu(DType* hptr, DType* dptr, std::vector<int> shape) {
       cudaMemcpyDeviceToHost));
 }
 
+template <class DType>
+void copy_to_cpu_async(DType* hptr, DType* dptr, std::vector<int> shape,
+                       cudaStream_t stream = 0) {
+  CUDA_CHECK(cudaMemcpyAsync(
+      hptr, dptr,
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
+          sizeof(DType),
+      cudaMemcpyDeviceToHost, stream));
+}
+
 template <class DType>
 void assert_allclose(DType* res_ptr, DType* golden_ptr, std::vector<int> shape,
                      float rtol = 1e-5, bool dump = false) {
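Both new helpers default to the legacy stream (stream = 0), which is how the two examples above call them without passing a stream. A brief usage sketch with an explicit stream follows; the buffer names are illustrative, not from the commit, and the host must synchronize before reading a buffer filled by copy_to_cpu_async:

// Illustrative usage of the new async copy helpers (not from the commit).
cudaStream_t s;
CUDA_CHECK(cudaStreamCreate(&s));

std::vector<int> shape{1024, 1024};
auto h_buf = alloc_cpu_tensor<half>(shape);   // pageable unless allocated pinned
auto d_buf = alloc_gpu_tensor<half>(shape);

copy_to_gpu_async(h_buf, d_buf, shape, s);    // H2D enqueued on s
// ... enqueue kernels on s that consume d_buf ...
copy_to_cpu_async(h_buf, d_buf, shape, s);    // D2H enqueued on s
CUDA_CHECK(cudaStreamSynchronize(s));         // h_buf is valid only after this
CUDA_CHECK(cudaStreamDestroy(s));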

util/cutlass/test_tile_scheduler.cu

+91
@@ -0,0 +1,91 @@
#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
#include "cutlass/kernel_hardware_info.hpp"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"

#include "common.h"

using namespace cutlass;
using namespace cutlass::gemm::kernel::detail;
using namespace cute;

using Scheduler = PersistentTileSchedulerSm90;

/// nvcc -arch=sm_90a -I ../../include -I /home/jshao/zhengsz/cutlass/include -lcuda -std=c++17 test_tile_scheduler.cu -o test

struct KernelSharedStorage {
};

struct KernelParams {
  int M;
  int N;
  int K;
  Scheduler::Params schedule_params;
  int* idx;
};

__global__ void test_kernel(KernelParams params) {
  Scheduler scheduler(params.schedule_params);
  auto tileinfo = scheduler.get_current_work();
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    int is_n = params.schedule_params.raster_order_ == Scheduler::RasterOrder::AlongN;
    printf("log swizzle %d is n %d\n", params.schedule_params.log_swizzle_size_, is_n);
  }
  if (threadIdx.x == 0) {
    printf("block %d maps to linear m %d n %d\n", blockIdx.x, tileinfo.M_idx, tileinfo.N_idx);
  }
}

int main() {
  const int M = 4096;
  const int N = 4096;
  const int K = 4096;
  dim3 grid(SM_NUMBER, 1, 1);
  dim3 block(WARP_GROUP_SIZE * WG_NUMBER, 1, 1);
  const int CLUSTER_M = 2;
  const int CLUSTER_N = 1;
  dim3 cluster(CLUSTER_M, CLUSTER_N, 1);
  int smemSizeBytes = sizeof(KernelSharedStorage);
  void const *kernel =
      (void const *)test_kernel;

  auto idx = alloc_cpu_tensor<int>({(int)block.x});
  auto g_idx = alloc_gpu_tensor<int>({(int)block.x});

  using ShapeMNKL = Shape<int, int, int, int>;
  ShapeMNKL shape{M, N, K, 1};
  using TileShape = Shape<_128, _128, _64>;
  TileShape tile_shape{};
  using ClusterShape = Shape<_2, _1, _1>;
  ClusterShape cluster_shape{};
  KernelHardwareInfo info{};
  Scheduler::Arguments args{};
  Scheduler::Params schedule_params = Scheduler::to_underlying_arguments(shape, tile_shape, cluster_shape, info, args);

  KernelParams params{M, N, K, schedule_params, g_idx};
  void *kernel_params[] = {&params};
  cudaLaunchConfig_t launch_config;
  launch_config.gridDim = {grid.x, grid.y, grid.z};
  launch_config.blockDim = {block.x, block.y, block.z};
  launch_config.dynamicSmemBytes = smemSizeBytes;
  launch_config.stream = nullptr;

  // launch with a (2, 1, 1) thread-block cluster via the extended launch API
  cudaLaunchAttribute launch_attribute[1];
  launch_attribute[0].id = cudaLaunchAttributeClusterDimension;
  launch_attribute[0].val.clusterDim.x = cluster.x;
  launch_attribute[0].val.clusterDim.y = cluster.y;
  launch_attribute[0].val.clusterDim.z = cluster.z;

  launch_config.attrs = launch_attribute;
  launch_config.numAttrs = 1;

  cudaError_t status =
      cudaLaunchKernelExC(&launch_config, kernel, kernel_params);
  cudaError_t launch_result = cudaGetLastError();
  CUDA_CHECK(status);
  CUDA_CHECK(launch_result);

  copy_to_cpu(idx, g_idx, {(int)block.x});

  return 0;
}
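The kernel prints the scheduler's raster order, its log swizzle size, and the (M_idx, N_idx) tile each persistent block picks up first. As a rough illustration of what such a mapping looks like, here is a host-side sketch of a generic swizzled along-N raster; the exact arithmetic inside PersistentTileSchedulerSm90 differs, so this only conveys the idea, not CUTLASS's implementation:

// Illustration only: a generic swizzled rasterization, NOT CUTLASS's exact mapping.
// With swizzle size S = 1 << log_swizzle, linear indices walk S tile columns at a
// time before moving down, keeping neighbouring CTAs on nearby output tiles.
#include <cstdio>

void linear_to_tile(int linear, int tiles_m, int log_swizzle, int* m_idx, int* n_idx) {
  const int S = 1 << log_swizzle;         // tile columns per swizzle group
  const int group_size = tiles_m * S;     // tiles in one column group
  const int group = linear / group_size;
  const int in_group = linear % group_size;
  *m_idx = in_group / S;                  // row advances fastest within a group
  *n_idx = group * S + in_group % S;
}

int main() {
  const int tiles_m = 4096 / 128;         // matches M = 4096 with 128x128 tiles
  for (int i = 0; i < 8; ++i) {
    int m, n;
    linear_to_tile(i, tiles_m, /*log_swizzle=*/1, &m, &n);
    std::printf("linear %d -> tile (m=%d, n=%d)\n", i, m, n);
  }
  return 0;
}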
