KnowingNothing
diff --git a/Diff for: ‎cutlass.py/fast_math.py
+42 b/Diff for: ‎cutlass.py/fast_math.py
+42
diff --git a/Diff for: ‎cutlass.py/hw_info.py
+108 b/Diff for: ‎cutlass.py/hw_info.py
+108
diff --git a/Diff for: ‎cutlass.py/mapping.py
+26-3 b/Diff for: ‎cutlass.py/mapping.py
+26-3
diff --git a/Diff for: ‎cutlass.py/mma.py
+1-1 b/Diff for: ‎cutlass.py/mma.py
+1-1
diff --git a/Diff for: ‎cutlass.py/swizzle.py
+96 b/Diff for: ‎cutlass.py/swizzle.py
+96
@@ -0,0 +1,42 @@
+def round_up(a, b):
+    """Round ``a`` up to a multiple of ``b``.
+
+    For integers with a positive ``b`` this is the smallest multiple of ``b``
+    that is greater than or equal to ``a``.
+    """
+    num_chunks = (a + b - 1) // b
+    return num_chunks * b
+
+
+def integer_log2(x):
+    """Return ``floor(log2(x))`` for a positive integer ``x``.
+
+    ``x == 0`` yields 0, the same as ``x == 1``.
+
+    Raises:
+        ValueError: if ``x`` is negative. A negative integer never reaches 0
+            under right shifts, so a shift-until-zero loop would not terminate.
+    """
+    if x < 0:
+        raise ValueError(f"integer_log2 expects a non-negative integer, got {x}")
+    n = 0
+    # Each halving that still leaves a value above 1 accounts for one more bit.
+    while x > 1:
+        x >>= 1
+        n += 1
+    return n
+
+
+class FastDivmodU64Pow2:
+    """Division and modulus by ``divisor`` using a shift and a mask.
+
+    The shift amount is ``integer_log2(divisor)`` and the mask is
+    ``divisor - 1``; the results equal ``//`` and ``%`` only when ``divisor``
+    is a power of two, which is not checked here.
+    """
+
+    def __init__(self, divisor=0) -> None:
+        self.divisor = divisor
+        self.shift_right = integer_log2(divisor)
+
+    def divide(self, dividend):
+        """Return ``dividend`` shifted right by ``shift_right`` bits."""
+        return dividend >> self.shift_right
+
+    def modulus(self, dividend):
+        """Return ``dividend`` masked with ``divisor - 1``."""
+        low_bits_mask = self.divisor - 1
+        return dividend & low_bits_mask
+
+    def divmod(self, dividend):
+        """Return ``(divide(dividend), modulus(dividend))``."""
+        return self.divide(dividend), self.modulus(dividend)
+
+    def __call__(self, dividend):
+        """Calling the object is the same as calling :meth:`divmod`."""
+        return self.divmod(dividend)
+
+
+class FastDivmodU64:
+    """Quotient and remainder by a fixed ``divisor`` using ``//`` and ``%``."""
+
+    def __init__(self, divisor):
+        self.divisor = divisor
+
+    def divmod(self, x):
+        """Return ``(x // divisor, x % divisor)``."""
+        quotient = x // self.divisor
+        remainder = x % self.divisor
+        return quotient, remainder
+
+    def __call__(self, x):
+        """Calling the object is the same as calling :meth:`divmod`."""
+        return self.divmod(x)
@@ -0,0 +1,108 @@
+import os
+import subprocess
+import tempfile
+from dataclasses import dataclass, field
+
+
+@dataclass
+class dim3:
+    """Three integer components named ``x``, ``y`` and ``z``."""
+
+    # Component along x.
+    x: int
+    # Component along y.
+    y: int
+    # Component along z.
+    z: int
+
+
+@dataclass
+class DeviceCoord:
+    """Grid, block and cluster dimensions together with the current indices.
+
+    ``gridDim``, ``blockDim`` and ``clusterDim`` are required. The index
+    fields default to ``dim3(0, 0, 0)`` and are replaced by
+    :meth:`set_blockIdx` and :meth:`set_threadIdx`.
+    """
+
+    gridDim: dim3
+    blockDim: dim3
+    clusterDim: dim3
+    # default_factory builds a fresh dim3 per instance. A bare dim3(...) default
+    # is an unhashable object, which dataclasses reject on Python 3.11+, and
+    # which would otherwise be shared by every instance.
+    blockIdx: dim3 = field(default_factory=lambda: dim3(0, 0, 0))
+    threadIdx: dim3 = field(default_factory=lambda: dim3(0, 0, 0))
+    blockIdx_in_cluster: dim3 = field(default_factory=lambda: dim3(0, 0, 0))
+
+    def block_id_in_cluster(self):
+        """Return ``blockIdx_in_cluster`` as an ``(x, y, z)`` tuple."""
+        return (
+            self.blockIdx_in_cluster.x,
+            self.blockIdx_in_cluster.y,
+            self.blockIdx_in_cluster.z,
+        )
+
+    def set_blockIdx(self, x, y, z):
+        """Set ``blockIdx`` and derive ``blockIdx_in_cluster`` from it.
+
+        Each in-cluster component is the block index modulo the matching
+        ``clusterDim`` component.
+        """
+        self.blockIdx = dim3(x, y, z)
+        self.blockIdx_in_cluster = dim3(
+            x % self.clusterDim.x,
+            y % self.clusterDim.y,
+            z % self.clusterDim.z,
+        )
+
+    def set_threadIdx(self, x, y, z):
+        """Set ``threadIdx`` to ``dim3(x, y, z)``."""
+        self.threadIdx = dim3(x, y, z)
+
+
+@dataclass
+class KernelHardwareInfo:
+    """Device id and multiprocessor count for a kernel launch."""
+
+    device_id: int = 0
+    sm_count: int = 0
+
+    @staticmethod
+    def query_device_multiprocessor_count(device_id: int = 0, arch: str = "90a"):
+        """Compile and run a small CUDA program that prints the multiprocessor count.
+
+        The program is built with ``nvcc -arch=sm_<arch>`` inside a private
+        temporary directory, which is removed on every exit path.
+
+        Args:
+            device_id: Device ordinal embedded in the generated CUDA source.
+            arch: Suffix appended to ``sm_`` for nvcc's ``-arch`` option.
+
+        Returns:
+            The integer printed by the program, or ``-1`` if compilation or
+            execution fails (a message is printed in that case).
+        """
+        cuda_header_code = f"""
+#include <cuda_runtime.h>
+#include <iostream>
+static constexpr int device_id = {device_id};
+    """
+        cuda_code = """
+int main() {
+    cudaError_t result = cudaSetDevice(device_id);
+    if (result != cudaSuccess) {
+        std::cerr << "cudaSetDevice() returned error "
+            << cudaGetErrorString(result) << std::endl;
+        return 1;
+    }
+    int multiprocessor_count;
+    result = cudaDeviceGetAttribute(&multiprocessor_count,
+                                    cudaDevAttrMultiProcessorCount, device_id);
+    if (result != cudaSuccess) {
+        std::cerr << "cudaDeviceGetAttribute() returned error "
+            << cudaGetErrorString(result) << std::endl;
+        return 1;
+    }
+    std::cout << multiprocessor_count << std::endl;
+    return 0;
+}
+    """
+        # Combine the header and main CUDA code
+        full_cuda_code = cuda_header_code + cuda_code
+
+        # A private temporary directory avoids clobbering files in the working
+        # directory, avoids races between concurrent callers, and is cleaned
+        # up even when compilation fails.
+        with tempfile.TemporaryDirectory() as build_dir:
+            source_path = os.path.join(build_dir, "temp_query_device.cu")
+            binary_path = os.path.join(build_dir, "temp_query_device")
+            with open(source_path, "w") as file:
+                file.write(full_cuda_code)
+
+            # Argument list instead of a shell string: `arch` is passed to nvcc
+            # verbatim and is never interpreted by a shell.
+            compile_command = [
+                "nvcc",
+                f"-arch=sm_{arch}",
+                source_path,
+                "-o",
+                binary_path,
+            ]
+            try:
+                subprocess.run(
+                    compile_command,
+                    check=True,
+                    text=True,
+                    stderr=subprocess.PIPE,
+                )
+            except subprocess.CalledProcessError as e:
+                print(f"Compilation failed: {e.stderr}")
+                return -1
+            except OSError as e:
+                # Without a shell, a missing or non-executable nvcc surfaces
+                # here rather than as a non-zero exit status.
+                print(f"Compilation failed: {e}")
+                return -1
+
+            # Run the compiled binary and capture the output
+            try:
+                result = subprocess.run(
+                    [binary_path], capture_output=True, text=True, check=True
+                )
+                return int(result.stdout.strip())
+            except subprocess.CalledProcessError as e:
+                print(f"Execution failed: {e.stderr}")
+                return -1
+            except (OSError, ValueError) as e:
+                # The binary could not be started, or its output was not an integer.
+                print(f"Execution failed: {e}")
+                return -1
+
+
+if __name__ == "__main__":
+    # When run as a script, print the result for the default device_id and arch
+    # (-1 if compiling or running the probe program failed).
+    print(KernelHardwareInfo.query_device_multiprocessor_count())
@@ -1,4 +1,27 @@
-class Mapping:
-    pass
+from typing import List, Optional
+from dataclasses import dataclass
+from tiling import HyperCube, HyperPoint
+from functools import reduce
 
-class 
+class Function:
+    """Base type with ``forward`` and ``backward`` hooks for subclasses to override."""
+
+    def forward(self, *args):
+        """Not implemented on the base class; always raises ``NotImplementedError``."""
+        raise NotImplementedError
+
+    def backward(self, *args):
+        """Not implemented on the base class; always raises ``NotImplementedError``."""
+        raise NotImplementedError
+    
+class Mapping(Function):
+    """A ``Function`` that holds a list of other ``Function`` objects."""
+
+    def __init__(self, functions: Optional[List[Function]] = None) -> None:
+        """Store ``functions`` (an empty list when ``None``) and check each entry's type."""
+        self.functions = [] if functions is None else functions
+        assert all(
+            isinstance(func, Function) for func in self.functions
+        ), "Should put Function type in Mapping."
+
+@dataclass
+class Layout(Function):
+    """A ``Function`` described by a ``shape`` and an optional ``stride``.
+
+    When ``stride`` is omitted, ``__post_init__`` tries to derive one from ``shape``.
+    """
+
+    shape: HyperCube
+    stride: Optional[HyperPoint] = None
+    
+    def __post_init__(self):
+        """Record ``len(shape)`` as ``ndim`` and fill in ``stride`` when it is ``None``."""
+        self.ndim = len(self.shape)
+        if self.stride is None:
+            # NOTE(review): `Hyperreduce` does not appear to be defined or imported
+            # anywhere in the visible code (only `HyperCube`, `HyperPoint` and
+            # `functools.reduce` are), so this line looks like it raises NameError
+            # as written. Presumably the `reduce(...)` result was meant to be
+            # wrapped in a `HyperPoint` -- TODO confirm the intended expression.
+            self.stride = Hyperreduce(lambda a, b: a + [a[-1] * b], reversed(self.shape[:-1]), [1])
@@ -69,7 +69,7 @@ def __init__(self, mma_op: MMA_OP) -> None:
         self.B_frag_type = SmemDesc(B_major)
 
         self.MNK_shape = HyperCube(3, [mma_op.M_tile, mma_op.N_tile, mma_op.K_tile])
-        self.
+        self.thread_id = 
 
 
 def gmma_selector(
 
@@ -0,0 +1,96 @@
+class Swizzle:
+    """XOR-based swizzle of an integer offset.
+
+    Parameters:
+        num_bits: Width of the bit mask (BBits).
+        num_base: Number of low bits below the mask (MBase).
+        num_shft: Distance between the two masked bit groups (SShift); its
+            sign selects the direction of the shift used in :meth:`apply`.
+
+    ``yyy_msk`` selects the bits that are read and ``zzz_msk`` the bits they
+    are XOR-ed into; ``swizzle_code`` is the union of both masks.
+    """
+
+    def __init__(self, num_bits: int, num_base: int, num_shft: int):
+        self.num_bits = num_bits
+        self.num_base = num_base
+        self.num_shft = num_shft
+
+        # The MBase check must test num_base. Testing num_bits twice let a
+        # negative base through to the shifts below.
+        assert self.num_base >= 0, "MBase must be positive."
+        assert self.num_bits >= 0, "BBits must be positive."
+        assert (
+            abs(self.num_shft) >= self.num_bits
+        ), "abs(SShift) must be more than BBits."
+
+        # num_bits consecutive ones, then placed at the two group positions.
+        self.bit_msk = (1 << self.num_bits) - 1
+        self.yyy_msk = self.bit_msk << (self.num_base + max(0, self.num_shft))
+        self.zzz_msk = self.bit_msk << (self.num_base - min(0, self.num_shft))
+        self.msk_sft = self.num_shft
+
+        self.swizzle_code = self.yyy_msk | self.zzz_msk
+
+    def apply(self, offset):
+        """Return ``offset`` XOR-ed with its ``yyy_msk`` bits moved by ``msk_sft``.
+
+        The masked bits are shifted right for a non-negative ``msk_sft`` and
+        left by ``-msk_sft`` otherwise.
+        """
+        selected = offset & self.yyy_msk
+        if self.msk_sft >= 0:
+            return offset ^ (selected >> self.msk_sft)
+        return offset ^ (selected << -self.msk_sft)
+
+    def __call__(self, offset):
+        """Calling the object is the same as calling :meth:`apply`."""
+        return self.apply(offset)
+
+
+def test_swizzle():
+    """Print index matrices before and after swizzling, for visual inspection.
+
+    Two configurations are shown: ``Swizzle(3, 4, 3)`` on a 128x64 matrix and
+    ``Swizzle(2, 0, -2)`` on a 4x4 matrix. Each ``(row, col)`` entry is turned
+    into a row-major linear index, swizzled, and turned back into a
+    ``(row, col)`` pair. Nothing is asserted; the output is only printed.
+    """
+
+    def get_ind_matrix(rows, cols):
+        # Matrix whose entry at [x][y] is the coordinate pair (x, y).
+        return [[(x, y) for y in range(cols)] for x in range(rows)]
+
+    def get_row_major_ind(x, y, cols):
+        return x * cols + y
+
+    def get_row_major_tuple(xy, cols):
+        return (xy // cols, xy % cols)
+
+    def print_matrix(mtx, func=lambda x: x, prompt=""):
+        # One line per row; every item is followed by a single space.
+        print(prompt)
+        for row in mtx:
+            print("".join(f"{func(item)} " for item in row))
+
+    def show_swizzle(num_bits, num_base, num_shft, rows, cols):
+        # Print the untouched matrix, then the same matrix with every entry swizzled.
+        print(f"Swizzle<{num_bits},{num_base},{num_shft}>")
+        mtx = get_ind_matrix(rows, cols)
+        print_matrix(mtx, prompt="Original")
+        print()
+        swizzle = Swizzle(num_bits, num_base, num_shft)
+        print_matrix(
+            mtx,
+            lambda tp: get_row_major_tuple(
+                swizzle(get_row_major_ind(tp[0], tp[1], cols)), cols
+            ),
+            prompt="After swizzle",
+        )
+        print()
+
+    # Swizzle<3, 4, 3>
+    show_swizzle(3, 4, 3, rows=128, cols=64)
+
+    # Swizzle<2, 0, -2>
+    show_swizzle(2, 0, -2, rows=4, cols=4)
+
+
+if __name__ == "__main__":
+    # No-op: running this module as a script does nothing; test_swizzle() is not called here.
+    pass