diff --git a/book/src/puzzle_04/intro.mojo b/book/src/puzzle_04/intro.mojo
index 4751240d..d88eed07 100644
--- a/book/src/puzzle_04/intro.mojo
+++ b/book/src/puzzle_04/intro.mojo
@@ -6,7 +6,7 @@ alias WIDTH = 3
alias dtype = DType.float32
alias layout = Layout.row_major(HEIGHT, WIDTH)
-fn kernel[dtype: DType, layout: Layout](tensor: LayoutTensor[mut=True, dtype, layout]):
+fn kernel[dtype: DType, layout: Layout](tensor: LayoutTensor[dtype, layout, MutAnyOrigin]):
print("Before:")
print(tensor)
tensor[0, 0] += 1
@@ -17,8 +17,8 @@ def main():
ctx = DeviceContext()
a = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)
- tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())
+ tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a)
# Note: since `tensor` is a device tensor we can't print it without the kernel wrapper
- ctx.enqueue_function[kernel[dtype, layout]](tensor, grid_dim=1, block_dim=1)
+ ctx.enqueue_function_checked[kernel[dtype, layout], kernel[dtype, layout]](tensor, grid_dim=1, block_dim=1)
ctx.synchronize()
diff --git a/book/src/puzzle_08/layout_tensor.md b/book/src/puzzle_08/layout_tensor.md
index ee441dfb..3a7fcec1 100644
--- a/book/src/puzzle_08/layout_tensor.md
+++ b/book/src/puzzle_08/layout_tensor.md
@@ -30,7 +30,7 @@ The key insight is how LayoutTensor simplifies shared memory management while ma
shared = stack_allocation[TPB, Scalar[dtype]]()
# LayoutTensor approach
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
2. **Memory access**: Same syntax
@@ -168,7 +168,7 @@ This solution demonstrates how LayoutTensor simplifies shared memory usage while
```txt
# Clean LayoutTensor API with address_space
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
- Natural indexing for both global and shared:
diff --git a/book/src/puzzle_11/layout_tensor.md b/book/src/puzzle_11/layout_tensor.md
index 8c4d6c36..a97dceea 100644
--- a/book/src/puzzle_11/layout_tensor.md
+++ b/book/src/puzzle_11/layout_tensor.md
@@ -24,7 +24,7 @@ The key insight is how LayoutTensor simplifies shared memory management while ma
Notes:
-- **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+- **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Window access**: Natural indexing for 3-element windows
- **Edge handling**: Special cases for first two positions
- **Memory pattern**: One shared memory load per thread
@@ -116,7 +116,7 @@ The solution implements a sliding window sum using LayoutTensor with these key s
- LayoutTensor creates block-local storage with address_space:
```txt
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
- Each thread loads one element:
diff --git a/book/src/puzzle_12/layout_tensor.md b/book/src/puzzle_12/layout_tensor.md
index e544a319..3038bef7 100644
--- a/book/src/puzzle_12/layout_tensor.md
+++ b/book/src/puzzle_12/layout_tensor.md
@@ -25,7 +25,7 @@ The key insight is how LayoutTensor simplifies memory management while maintaini
Notes:
-- **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+- **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Element access**: Natural indexing with bounds checking
- **Layout handling**: Separate layouts for input and output
- **Thread coordination**: Same synchronization patterns with `barrier()`
diff --git a/book/src/puzzle_13/block_boundary.md b/book/src/puzzle_13/block_boundary.md
index 45c8adb5..115740b8 100644
--- a/book/src/puzzle_13/block_boundary.md
+++ b/book/src/puzzle_13/block_boundary.md
@@ -32,7 +32,7 @@ Notes:
-1. Use `LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` for shared memory
+1. Use `LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` for shared memory
2. Load main data: `shared_a[local_i] = a[global_i]`
3. Load boundary: `if local_i < CONV_2 - 1` handle next block data
4. Load kernel: `shared_b[local_i] = b[local_i]`
@@ -125,8 +125,8 @@ Size calculation:
```mojo
# First: account for padding needed for convolution window
- shared_a = LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
- shared_b = LayoutTensor[dtype, Layout.row_major(CONV_2), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_a = LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_b = LayoutTensor[dtype, Layout.row_major(CONV_2), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
This allocation pattern ensures we have enough space for both the block's data and the overlap region.
diff --git a/book/src/puzzle_14/complete.md b/book/src/puzzle_14/complete.md
index 393d95d2..73b6dbea 100644
--- a/book/src/puzzle_14/complete.md
+++ b/book/src/puzzle_14/complete.md
@@ -343,10 +343,10 @@ The two kernel phases execute sequentially **without any explicit synchronizatio
```mojo
# Phase 1: Local prefix sums
-ctx.enqueue_function[prefix_sum_local_phase[...]](...)
+ctx.enqueue_function_checked[prefix_sum_local_phase[...], prefix_sum_local_phase[...]](...)
# Phase 2: Add block sums (automatically waits for Phase 1)
-ctx.enqueue_function[prefix_sum_block_sum_phase[...]](...)
+ctx.enqueue_function_checked[prefix_sum_block_sum_phase[...], prefix_sum_block_sum_phase[...]](...)
```
**Key insight**: Mojo's `DeviceContext` uses a single execution stream (CUDA stream on NVIDIA GPUs, HIP stream on AMD ROCm GPUs), which guarantees that kernel launches execute in the exact order they are enqueued. No explicit synchronization is needed between kernels.
diff --git a/book/src/puzzle_16/shared_memory.md b/book/src/puzzle_16/shared_memory.md
index 662f802a..27a70216 100644
--- a/book/src/puzzle_16/shared_memory.md
+++ b/book/src/puzzle_16/shared_memory.md
@@ -131,8 +131,8 @@ Matrix B: b_shared: (similar layout)
```mojo
# Create 2D shared memory tensors using LayoutTensor with address_space
- a_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
- b_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ a_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ b_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
2. **Thread Indexing**:
diff --git a/book/src/puzzle_17/puzzle_17.md b/book/src/puzzle_17/puzzle_17.md
index 913c3f60..4be1682b 100644
--- a/book/src/puzzle_17/puzzle_17.md
+++ b/book/src/puzzle_17/puzzle_17.md
@@ -181,7 +181,7 @@ Let's break down how this works in the larger context:
```mojo
gpu_ctx = ctx.get_device_context()
gpu_ctx.enqueue_memset(...) # Zero output buffer
- gpu_ctx.enqueue_function[...](...) # Schedule kernel
+ gpu_ctx.enqueue_function_checked[..., ...](...) # Schedule kernel
```
- Device context manages GPU resources
diff --git a/book/src/puzzle_18/puzzle_18.md b/book/src/puzzle_18/puzzle_18.md
index 5294cdb3..3a2427b4 100644
--- a/book/src/puzzle_18/puzzle_18.md
+++ b/book/src/puzzle_18/puzzle_18.md
@@ -265,8 +265,8 @@ The kernel is parameterized with:
#### Shared memory allocation
```mojo
-shared_max = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
-shared_sum = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+shared_max = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+shared_sum = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```
The kernel allocates two shared memory buffers:
diff --git a/book/src/puzzle_19/puzzle_19.md b/book/src/puzzle_19/puzzle_19.md
index d56f0fde..3edac292 100644
--- a/book/src/puzzle_19/puzzle_19.md
+++ b/book/src/puzzle_19/puzzle_19.md
@@ -121,7 +121,7 @@ To complete this puzzle, we'll leverage the tiled matmul kernel from [Puzzle 16]
**Transpose Kernel Implementation Guide:**
-1. **Shared Memory Setup**: Use `LayoutTensor[dtype, Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` to create a square `TRANSPOSE_BLOCK_DIM_XY` × `TRANSPOSE_BLOCK_DIM_XY` shared memory tile for efficient data exchange between threads
+1. **Shared Memory Setup**: Use `LayoutTensor[dtype, Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` to create a square `TRANSPOSE_BLOCK_DIM_XY` × `TRANSPOSE_BLOCK_DIM_XY` shared memory tile for efficient data exchange between threads
2. **Thread Indexing**: Map threads to matrix elements:
- `local_row = thread_idx.y`, `local_col = thread_idx.x` (position within the block)
diff --git a/book/src/puzzle_32/conflict_free_patterns.md b/book/src/puzzle_32/conflict_free_patterns.md
index 0e717114..750ca61d 100644
--- a/book/src/puzzle_32/conflict_free_patterns.md
+++ b/book/src/puzzle_32/conflict_free_patterns.md
@@ -354,7 +354,7 @@ constant = shared[0] # All threads read same address - hardware optimized
**3. Padding techniques:**
```mojo
-shared = LayoutTensor[dtype, Layout.row_major(TPB + 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation() # Shift access patterns
+shared = LayoutTensor[dtype, Layout.row_major(TPB + 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation() # Shift access patterns
```
**4. Access pattern analysis:**
diff --git a/book/src/puzzle_34/cluster_coordination_basics.md b/book/src/puzzle_34/cluster_coordination_basics.md
index fd83c630..58d84a24 100644
--- a/book/src/puzzle_34/cluster_coordination_basics.md
+++ b/book/src/puzzle_34/cluster_coordination_basics.md
@@ -65,7 +65,7 @@ Traditional single-block algorithms like those in [Puzzle 27](../puzzle_27/puzzl
### **Shared memory coordination**
-- Allocate shared memory using `LayoutTensor[dtype, Layout.row_major(tpb), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` (see [shared memory basics from Puzzle 8](../puzzle_08/puzzle_08.md))
+- Allocate shared memory using `LayoutTensor[dtype, Layout.row_major(tpb), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` (see [shared memory basics from Puzzle 8](../puzzle_08/puzzle_08.md))
- Process input data scaled by `block_id + 1` to create distinct scaling per block
- Use bounds checking when accessing input data (pattern from [guards in Puzzle 3](../puzzle_03/puzzle_03.md))
@@ -153,7 +153,7 @@ block_id = Int(block_idx.x) # Block index for reliable
**Shared memory allocation and data processing:**
-- Each block allocates its own shared memory workspace: `LayoutTensor[dtype, Layout.row_major(tpb), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+- Each block allocates its own shared memory workspace: `LayoutTensor[dtype, Layout.row_major(tpb), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Scaling strategy**: `data_scale = Float32(block_id + 1)` ensures each block processes data differently
- Block 0: multiplies by 1.0, Block 1: by 2.0, Block 2: by 3.0, Block 3: by 4.0
- **Bounds checking**: `if global_i < size:` prevents out-of-bounds memory access
diff --git a/problems/p01/p01.mojo b/problems/p01/p01.mojo
index bda50bdc..0fa69032 100644
--- a/problems/p01/p01.mojo
+++ b/problems/p01/p01.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10, add_10](
+ out,
+ a,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
diff --git a/problems/p02/p02.mojo b/problems/p02/p02.mojo
index be6b8353..b27bca85 100644
--- a/problems/p02/p02.mojo
+++ b/problems/p02/p02.mojo
@@ -34,10 +34,10 @@ def main():
b_host[i] = i
expected[i] = a_host[i] + b_host[i]
- ctx.enqueue_function[add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[add, add](
+ out,
+ a,
+ b,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index 474489c5..6421a168 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10_guard](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_guard, add_10_guard](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p04/p04.mojo b/problems/p04/p04.mojo
index 2a954400..d1aa2d78 100644
--- a/problems/p04/p04.mojo
+++ b/problems/p04/p04.mojo
@@ -37,9 +37,9 @@ def main():
a_host[i * SIZE + j] = i * SIZE + j
expected[i * SIZE + j] = a_host[i * SIZE + j] + 10
- ctx.enqueue_function[add_10_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p04/p04_layout_tensor.mojo b/problems/p04/p04_layout_tensor.mojo
index 01e4b3f3..89bf9d7e 100644
--- a/problems/p04/p04_layout_tensor.mojo
+++ b/problems/p04/p04_layout_tensor.mojo
@@ -12,8 +12,8 @@ alias layout = Layout.row_major(SIZE, SIZE)
fn add_10_2d(
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, MutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -27,7 +27,7 @@ fn add_10_2d(
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, layout](out_buf.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out_buf)
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())
expected = ctx.enqueue_create_host_buffer[dtype](
@@ -40,9 +40,9 @@ def main():
a_host[i] = i
expected[i] = a_host[i] + 10
- a_tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a)
- ctx.enqueue_function[add_10_2d](
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
out_tensor,
a_tensor,
SIZE,
diff --git a/problems/p05/p05.mojo b/problems/p05/p05.mojo
index 37e8aa83..003a5565 100644
--- a/problems/p05/p05.mojo
+++ b/problems/p05/p05.mojo
@@ -39,10 +39,10 @@ def main():
for j in range(SIZE):
expected[i * SIZE + j] = a_host[j] + b_host[i]
- ctx.enqueue_function[broadcast_add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[broadcast_add, broadcast_add](
+ out,
+ a,
+ b,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index 42fee181..dd3a5f25 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -18,9 +18,9 @@ fn broadcast_add[
a_layout: Layout,
b_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
- b: LayoutTensor[mut=False, dtype, b_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -32,16 +32,14 @@ fn broadcast_add[
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())
expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
).enqueue_fill(0)
- expected_tensor = LayoutTensor[mut=True, dtype, out_layout](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
@@ -55,10 +53,11 @@ def main():
for j in range(SIZE):
expected_tensor[i, j] = a_host[j] + b_host[i]
- a_tensor = LayoutTensor[dtype, a_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[dtype, b_layout](b.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, b_layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[broadcast_add[out_layout, a_layout, b_layout]](
+ alias kernel = broadcast_add[out_layout, a_layout, b_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
diff --git a/problems/p06/p06.mojo b/problems/p06/p06.mojo
index c679b21a..6a56d9ce 100644
--- a/problems/p06/p06.mojo
+++ b/problems/p06/p06.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10_blocks](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks, add_10_blocks](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p07/p07.mojo b/problems/p07/p07.mojo
index 09db5cd1..623000e1 100644
--- a/problems/p07/p07.mojo
+++ b/problems/p07/p07.mojo
@@ -38,9 +38,9 @@ def main():
a_host[k] = k
expected[k] = k + 10
- ctx.enqueue_function[add_10_blocks_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p07/p07_layout_tensor.mojo b/problems/p07/p07_layout_tensor.mojo
index 8f939fe4..02ca5153 100644
--- a/problems/p07/p07_layout_tensor.mojo
+++ b/problems/p07/p07_layout_tensor.mojo
@@ -16,8 +16,8 @@ fn add_10_blocks_2d[
out_layout: Layout,
a_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
size: Int,
):
row = block_dim.y * block_idx.y + thread_idx.y
@@ -31,9 +31,7 @@ fn add_10_blocks_2d[
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)
expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
@@ -48,11 +46,10 @@ def main():
a_host[k] = k
expected_buf[k] = k + 10
- a_tensor = LayoutTensor[dtype, a_layout, MutableAnyOrigin](
- a.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[add_10_blocks_2d[out_layout, a_layout]](
+ alias kernel = add_10_blocks_2d[out_layout, a_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
SIZE,
@@ -62,16 +59,14 @@ def main():
ctx.synchronize()
- expected_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)
with out_buf.map_to_host() as out_buf_host:
print(
"out:",
- LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf_host.unsafe_ptr()
- ),
+ LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf_host),
)
print("expected:", expected_tensor)
for i in range(SIZE):
diff --git a/problems/p08/p08.mojo b/problems/p08/p08.mojo
index dd74f555..1afaa281 100644
--- a/problems/p08/p08.mojo
+++ b/problems/p08/p08.mojo
@@ -43,9 +43,9 @@ def main():
with DeviceContext() as ctx:
out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(1)
- ctx.enqueue_function[add_10_shared](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_shared, add_10_shared](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p08/p08_layout_tensor.mojo b/problems/p08/p08_layout_tensor.mojo
index a6fce741..2203d0eb 100644
--- a/problems/p08/p08_layout_tensor.mojo
+++ b/problems/p08/p08_layout_tensor.mojo
@@ -17,15 +17,15 @@ alias layout = Layout.row_major(SIZE)
fn add_10_shared_layout_tensor[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
# Allocate shared memory using LayoutTensor with explicit address_space
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -48,10 +48,11 @@ def main():
out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(1)
- out_tensor = LayoutTensor[dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[add_10_shared_layout_tensor[layout]](
+ alias kernel = add_10_shared_layout_tensor[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
SIZE,
diff --git a/problems/p09/p09.mojo b/problems/p09/p09.mojo
index 5df6563d..da7d208f 100644
--- a/problems/p09/p09.mojo
+++ b/problems/p09/p09.mojo
@@ -60,7 +60,7 @@ fn collaborative_filter(
shared_workspace = LayoutTensor[
dtype,
Layout.row_major(SIZE - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/problems/p10/p10.mojo b/problems/p10/p10.mojo
index 1f9fdea4..94627aa1 100644
--- a/problems/p10/p10.mojo
+++ b/problems/p10/p10.mojo
@@ -25,7 +25,7 @@ fn shared_memory_race(
shared_sum = LayoutTensor[
dtype,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/problems/p11/p11.mojo b/problems/p11/p11.mojo
index 62d3c6d1..cc42c861 100644
--- a/problems/p11/p11.mojo
+++ b/problems/p11/p11.mojo
@@ -39,9 +39,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[pooling](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[pooling, pooling](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
@@ -52,7 +52,7 @@ def main():
ctx.synchronize()
with a.map_to_host() as a_host:
- ptr = a_host.unsafe_ptr()
+ ptr = a_host
for i in range(SIZE):
s = Scalar[dtype](0)
for j in range(max(i - 2, 0), i + 1):
diff --git a/problems/p11/p11_layout_tensor.mojo b/problems/p11/p11_layout_tensor.mojo
index b24d2f81..941275f1 100644
--- a/problems/p11/p11_layout_tensor.mojo
+++ b/problems/p11/p11_layout_tensor.mojo
@@ -16,15 +16,15 @@ alias layout = Layout.row_major(SIZE)
fn pooling[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
# Allocate shared memory using tensor builder
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -45,10 +45,10 @@ def main():
for i in range(SIZE):
a_host[i] = i
- out_tensor = LayoutTensor[dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[pooling[layout]](
+ ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
out_tensor,
a_tensor,
SIZE,
@@ -60,7 +60,7 @@ def main():
ctx.synchronize()
with a.map_to_host() as a_host:
- ptr = a_host.unsafe_ptr()
+ ptr = a_host
for i in range(SIZE):
s = Scalar[dtype](0)
for j in range(max(i - 2, 0), i + 1):
diff --git a/problems/p12/p12.mojo b/problems/p12/p12.mojo
index 690230cc..383eff4c 100644
--- a/problems/p12/p12.mojo
+++ b/problems/p12/p12.mojo
@@ -36,10 +36,10 @@ def main():
a_host[i] = i
b_host[i] = i
- ctx.enqueue_function[dot_product](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[dot_product, dot_product](
+ out,
+ a,
+ b,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/problems/p12/p12_layout_tensor.mojo b/problems/p12/p12_layout_tensor.mojo
index e94c32ee..1636a180 100644
--- a/problems/p12/p12_layout_tensor.mojo
+++ b/problems/p12/p12_layout_tensor.mojo
@@ -18,9 +18,9 @@ alias out_layout = Layout.row_major(1)
fn dot_product[
in_layout: Layout, out_layout: Layout
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=True, dtype, in_layout],
- b: LayoutTensor[mut=True, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
# FILL ME IN (roughly 13 lines)
@@ -41,11 +41,12 @@ def main():
a_host[i] = i
b_host[i] = i
- out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[dtype, layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[dot_product[layout, out_layout]](
+ alias kernel = dot_product[layout, out_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
index f0bcd5c9..1272a31e 100644
--- a/problems/p17/op/conv1d.mojo
+++ b/problems/p17/op/conv1d.mojo
@@ -16,9 +16,9 @@ fn conv1d_kernel[
conv_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- input: LayoutTensor[mut=True, dtype, in_layout],
- kernel: LayoutTensor[mut=True, dtype, conv_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
+ kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
local_i = thread_idx.x
@@ -26,13 +26,13 @@ fn conv1d_kernel[
shared_a = LayoutTensor[
dtype,
Layout.row_major(TPB + conv_size - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(conv_size),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < input_size:
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
index d4d5621e..5876df2b 100644
--- a/problems/p18/op/softmax.mojo
+++ b/problems/p18/op/softmax.mojo
@@ -38,8 +38,8 @@ fn softmax_cpu_kernel[
input_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[dtype, layout, MutableAnyOrigin],
- input: LayoutTensor[dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, MutAnyOrigin],
):
# FILL IN (roughly 10 lines)
...
@@ -65,12 +65,12 @@ struct SoftmaxCustomOp:
ctx: DeviceContextPtr,
) raises:
# Note: rebind is necessary now but it shouldn't be!
- var output_tensor = rebind[
- LayoutTensor[dtype, layout, MutableAnyOrigin]
- ](output.to_layout_tensor())
- var input_tensor = rebind[
- LayoutTensor[dtype, layout, MutableAnyOrigin]
- ](input.to_layout_tensor())
+ var output_tensor = rebind[LayoutTensor[dtype, layout, MutAnyOrigin]](
+ output.to_layout_tensor()
+ )
+ var input_tensor = rebind[LayoutTensor[dtype, layout, MutAnyOrigin]](
+ input.to_layout_tensor()
+ )
@parameter
if target == "gpu":
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
index ceffc72c..eddb45d1 100644
--- a/problems/p19/op/attention.mojo
+++ b/problems/p19/op/attention.mojo
@@ -37,9 +37,9 @@ fn matmul_idiomatic_tiled[
inner: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, a_layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, b_layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, a_layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
):
"""Updated idiomatic tiled matrix multiplication from p16."""
local_row = thread_idx.y
@@ -54,13 +54,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
var acc: output.element_type = 0
@@ -124,8 +124,8 @@ fn transpose_kernel[
cols: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
- inp: LayoutTensor[mut=False, dtype, layout_in, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout_out, MutAnyOrigin],
+ inp: LayoutTensor[mut=False, dtype, layout_in, MutAnyOrigin],
):
# FILL ME IN (roughly 18 lines)
...
@@ -146,13 +146,13 @@ fn softmax_gpu_kernel[
shared_max = LayoutTensor[
dtype,
Layout.row_major(SOFTMAX_BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_sum = LayoutTensor[
dtype,
Layout.row_major(SOFTMAX_BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = thread_idx.x
@@ -213,10 +213,10 @@ fn attention_cpu_kernel[
d: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[dtype, layout_out, MutableAnyOrigin],
- q: LayoutTensor[dtype, layout_q, MutableAnyOrigin],
- k: LayoutTensor[dtype, layout_k, MutableAnyOrigin],
- v: LayoutTensor[dtype, layout_v, MutableAnyOrigin],
+ output: LayoutTensor[dtype, layout_out, MutAnyOrigin],
+ q: LayoutTensor[dtype, layout_q, MutAnyOrigin],
+ k: LayoutTensor[dtype, layout_k, MutAnyOrigin],
+ v: LayoutTensor[dtype, layout_v, MutAnyOrigin],
):
"""CPU implementation of vector attention."""
var scores = List[Float32]()
@@ -278,15 +278,15 @@ struct AttentionCustomOp:
# Convert to layout tensors
var output_tensor = rebind[
- LayoutTensor[dtype, layout_out, MutableAnyOrigin]
+ LayoutTensor[dtype, layout_out, MutAnyOrigin]
](output.to_layout_tensor())
- var q_tensor = rebind[LayoutTensor[dtype, layout_q, MutableAnyOrigin]](
+ var q_tensor = rebind[LayoutTensor[dtype, layout_q, MutAnyOrigin]](
q.to_layout_tensor()
)
- var k_tensor = rebind[LayoutTensor[dtype, layout_k, MutableAnyOrigin]](
+ var k_tensor = rebind[LayoutTensor[dtype, layout_k, MutAnyOrigin]](
k.to_layout_tensor()
)
- var v_tensor = rebind[LayoutTensor[dtype, layout_v, MutableAnyOrigin]](
+ var v_tensor = rebind[LayoutTensor[dtype, layout_v, MutAnyOrigin]](
v.to_layout_tensor()
)
@@ -342,7 +342,7 @@ struct AttentionCustomOp:
seq_len
) # Reused for scores and weights
- k_t = LayoutTensor[mut=True, dtype, layout_k_t, MutableAnyOrigin](
+ k_t = LayoutTensor[mut=True, dtype, layout_k_t, MutAnyOrigin](
k_t_buf.unsafe_ptr()
)
diff --git a/problems/p20/op/conv1d.mojo b/problems/p20/op/conv1d.mojo
index b03d972a..e89d81b3 100644
--- a/problems/p20/op/conv1d.mojo
+++ b/problems/p20/op/conv1d.mojo
@@ -19,9 +19,9 @@ fn conv1d_kernel[
conv_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- input: LayoutTensor[mut=True, dtype, in_layout],
- kernel: LayoutTensor[mut=True, dtype, conv_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
+ kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
local_i = thread_idx.x
@@ -29,13 +29,13 @@ fn conv1d_kernel[
shared_a = LayoutTensor[
dtype,
Layout.row_major(TPB + conv_size - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(conv_size),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < input_size:
@@ -113,11 +113,10 @@ struct Conv1DCustomOp:
),
0,
)
- gpu_ctx.enqueue_function[
- conv1d_kernel[
- in_layout, out_layout, conv_layout, input_size, conv_size
- ]
- ](
+ alias kernel = conv1d_kernel[
+ in_layout, out_layout, conv_layout, input_size, conv_size
+ ]
+ gpu_ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
input_tensor,
kernel_tensor,
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
index bc650460..88d535cf 100644
--- a/problems/p21/op/embedding.mojo
+++ b/problems/p21/op/embedding.mojo
@@ -19,9 +19,9 @@ fn embedding_kernel_coalesced[
embed_dim: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- indices: LayoutTensor[mut=True, DType.int32, indices_layout],
- weights: LayoutTensor[mut=True, dtype, weights_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ indices: LayoutTensor[DType.int32, indices_layout, MutAnyOrigin],
+ weights: LayoutTensor[dtype, weights_layout, MutAnyOrigin],
):
"""
Memory-coalescing focused embedding kernel.
@@ -63,9 +63,9 @@ fn embedding_kernel_2d[
embed_dim: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- indices: LayoutTensor[mut=True, DType.int32, indices_layout],
- weights: LayoutTensor[mut=True, dtype, weights_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ indices: LayoutTensor[DType.int32, indices_layout, MutAnyOrigin],
+ weights: LayoutTensor[dtype, weights_layout, MutAnyOrigin],
):
"""
2D grid non-coalesced embedding kernel.
@@ -154,20 +154,19 @@ struct EmbeddingCustomOp:
blocks = max(1, ceildiv(total_elements, THREADS_PER_BLOCK))
# Compile and launch optimized kernel
- compiled_kernel = gpu_ctx.compile_function[
- embedding_kernel_coalesced[
- indices_layout,
- weights_layout,
- out_layout,
- batch_size,
- seq_len,
- vocab_size,
- embed_dim,
- output.dtype,
- ]
- ]()
-
- gpu_ctx.enqueue_function(
+ alias kernel = embedding_kernel_coalesced[
+ indices_layout,
+ weights_layout,
+ out_layout,
+ batch_size,
+ seq_len,
+ vocab_size,
+ embed_dim,
+ output.dtype,
+ ]
+ compiled_kernel = gpu_ctx.compile_function_checked[kernel, kernel]()
+
+ gpu_ctx.enqueue_function_checked(
compiled_kernel,
output_tensor,
indices_tensor,
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
index 3d0fedda..ae28d462 100644
--- a/problems/p22/op/layernorm_linear.mojo
+++ b/problems/p22/op/layernorm_linear.mojo
@@ -28,9 +28,9 @@ fn matmul_idiomatic_tiled[
inner: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, a_layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, b_layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, a_layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
):
"""Idiomatic tiled matrix multiplication from p19."""
local_row = thread_idx.y
@@ -45,13 +45,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
var acc: output.element_type = 0
@@ -153,8 +153,8 @@ fn transpose_kernel[
cols: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
- inp: LayoutTensor[mut=False, dtype, layout_in, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout_out, MutAnyOrigin],
+ inp: LayoutTensor[mut=False, dtype, layout_in, MutAnyOrigin],
):
"""Transpose matrix using shared memory tiling for coalesced access.
We will learn more about coalesced access in the next part.
@@ -162,7 +162,7 @@ fn transpose_kernel[
shared_tile = LayoutTensor[
dtype,
Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/problems/p23/p23.mojo b/problems/p23/p23.mojo
index 67b36029..91421916 100644
--- a/problems/p23/p23.mojo
+++ b/problems/p23/p23.mojo
@@ -20,9 +20,9 @@ alias SIMD_WIDTH = simd_width_of[dtype, target = get_gpu_target()]()
fn elementwise_add[
layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -52,9 +52,9 @@ fn tiled_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -87,9 +87,9 @@ fn manual_vectorized_tiled_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
# Each tile contains tile_size groups of simd_width elements
@@ -128,9 +128,9 @@ fn vectorize_within_tiles_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
# Each tile contains tile_size elements (not SIMD groups)
@@ -181,13 +181,13 @@ fn benchmark_elementwise_parameterized[
a_host[i] = 2 * i
b_host[i] = 2 * i + 1
- a_tensor = LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin](
+ a_tensor = LayoutTensor[mut=False, dtype, layout, MutAnyOrigin](
a.unsafe_ptr()
)
- b_tensor = LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin](
+ b_tensor = LayoutTensor[mut=False, dtype, layout, MutAnyOrigin](
b_buf.unsafe_ptr()
)
- out_tensor = LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin](
+ out_tensor = LayoutTensor[mut=True, dtype, layout, MutAnyOrigin](
out.unsafe_ptr()
)
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index a0531f05..21f16679 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -34,9 +34,9 @@ alias out_layout = Layout.row_major(1)
fn traditional_dot_product_p12_style[
in_layout: Layout, out_layout: Layout, size: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
):
"""
This is the complex approach from p12_layout_tensor.mojo - kept for comparison.
@@ -44,7 +44,7 @@ fn traditional_dot_product_p12_style[
shared = LayoutTensor[
dtype,
Layout.row_major(WARP_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -75,9 +75,9 @@ fn traditional_dot_product_p12_style[
fn simple_warp_dot_product[
in_layout: Layout, out_layout: Layout, size: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
# FILL IN (6 lines at most)
@@ -95,9 +95,9 @@ fn functional_warp_dot_product[
rank: Int,
size: Int,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -178,16 +178,15 @@ fn benchmark_simple_warp_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
fn traditional_workflow(ctx: DeviceContext) raises:
- ctx.enqueue_function[
- simple_warp_dot_product[in_layout, out_layout, test_size]
- ](
+ alias kernel = simple_warp_dot_product[in_layout, out_layout, test_size]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -225,9 +224,9 @@ fn benchmark_functional_warp_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
@@ -267,15 +266,16 @@ fn benchmark_traditional_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
fn traditional_workflow(ctx: DeviceContext) raises:
- ctx.enqueue_function[
- traditional_dot_product_p12_style[in_layout, out_layout, test_size]
+ ctx.enqueue_function_checked[
+ traditional_dot_product_p12_style[in_layout, out_layout, test_size],
+ traditional_dot_product_p12_style[in_layout, out_layout, test_size],
](
out_tensor,
a_tensor,
@@ -306,11 +306,9 @@ def main():
n_warps
).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
with a.map_to_host() as a_host, b.map_to_host() as b_host:
for i in range(SIZE):
@@ -318,10 +316,13 @@ def main():
b_host[i] = i
if argv()[1] == "--traditional":
- ctx.enqueue_function[
+ ctx.enqueue_function_checked[
+ traditional_dot_product_p12_style[
+ in_layout, out_layout, SIZE
+ ],
traditional_dot_product_p12_style[
in_layout, out_layout, SIZE
- ]
+ ],
](
out_tensor,
a_tensor,
@@ -330,8 +331,9 @@ def main():
block_dim=THREADS_PER_BLOCK,
)
elif argv()[1] == "--kernel":
- ctx.enqueue_function[
- simple_warp_dot_product[in_layout, out_layout, SIZE]
+ ctx.enqueue_function_checked[
+ simple_warp_dot_product[in_layout, out_layout, SIZE],
+ simple_warp_dot_product[in_layout, out_layout, SIZE],
](
out_tensor,
a_tensor,
diff --git a/problems/p27/p27.mojo b/problems/p27/p27.mojo
index b21efd7e..211d5a3c 100644
--- a/problems/p27/p27.mojo
+++ b/problems/p27/p27.mojo
@@ -14,9 +14,9 @@ from math import floor
fn traditional_dot_product[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
"""Traditional dot product using shared memory + barriers + tree reduction.
@@ -25,7 +25,7 @@ fn traditional_dot_product[
shared = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -66,9 +66,9 @@ alias dtype = DType.float32
fn block_sum_dot_product[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
"""Dot product using block.sum() - convenience function like warp.sum()!
@@ -89,9 +89,9 @@ alias bin_layout = Layout.row_major(SIZE) # Max SIZE elements per bin
fn block_histogram_bin_extract[
in_layout: Layout, bin_layout: Layout, out_layout: Layout, tpb: Int
](
- input_data: LayoutTensor[mut=False, dtype, in_layout],
- bin_output: LayoutTensor[mut=True, dtype, bin_layout],
- count_output: LayoutTensor[mut=True, DType.int32, out_layout],
+ input_data: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ bin_output: LayoutTensor[dtype, bin_layout, MutAnyOrigin],
+ count_output: LayoutTensor[DType.int32, out_layout, MutAnyOrigin],
size: Int,
target_bin: Int,
num_bins: Int,
@@ -139,8 +139,8 @@ alias vector_layout = Layout.row_major(SIZE)
fn block_normalize_vector[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- input_data: LayoutTensor[mut=False, dtype, in_layout],
- output_data: LayoutTensor[mut=True, dtype, out_layout],
+ input_data: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ output_data: LayoutTensor[dtype, out_layout, MutAnyOrigin],
size: Int,
):
"""Vector mean normalization using block.sum() + block.broadcast() combination.
@@ -205,18 +205,13 @@ def main():
print("TPB:", TPB)
print("Expected result:", expected)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](
- b_buf.unsafe_ptr()
- )
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b_buf)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
# Traditional approach: works perfectly when size == TPB
- ctx.enqueue_function[
- traditional_dot_product[in_layout, out_layout, TPB]
- ](
+ alias kernel = traditional_dot_product[in_layout, out_layout, TPB]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -249,18 +244,13 @@ def main():
print("TPB:", TPB)
print("Expected result:", expected)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](
- b_buf.unsafe_ptr()
- )
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b_buf)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
# Block.sum(): Same result with dramatically simpler code!
- ctx.enqueue_function[
- block_sum_dot_product[in_layout, out_layout, TPB]
- ](
+ alias kernel = block_sum_dot_product[in_layout, out_layout, TPB]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -306,8 +296,8 @@ def main():
print("...")
print()
- input_tensor = LayoutTensor[mut=False, dtype, in_layout](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](
+ input_buf
)
# Demonstrate histogram for each bin using block.prefix_sum()
@@ -330,19 +320,18 @@ def main():
1
).enqueue_fill(0)
- bin_tensor = LayoutTensor[mut=True, dtype, bin_layout](
- bin_data.unsafe_ptr()
- )
- count_tensor = LayoutTensor[mut=True, DType.int32, out_layout](
- bin_count.unsafe_ptr()
+ bin_tensor = LayoutTensor[dtype, bin_layout, MutAnyOrigin](
+ bin_data
)
+ count_tensor = LayoutTensor[
+ DType.int32, out_layout, MutAnyOrigin
+ ](bin_count)
# Execute histogram kernel for this specific bin
- ctx.enqueue_function[
- block_histogram_bin_extract[
- in_layout, bin_layout, out_layout, TPB
- ]
- ](
+ alias kernel = block_histogram_bin_extract[
+ in_layout, bin_layout, out_layout, TPB
+ ]
+ ctx.enqueue_function_checked[kernel, kernel](
input_tensor,
bin_tensor,
count_tensor,
@@ -405,17 +394,18 @@ def main():
print("Mean value:", mean_value)
print()
- input_tensor = LayoutTensor[mut=False, dtype, in_layout](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](
+ input_buf
)
- output_tensor = LayoutTensor[mut=True, dtype, vector_layout](
- output_buf.unsafe_ptr()
+ output_tensor = LayoutTensor[dtype, vector_layout, MutAnyOrigin](
+ output_buf
)
# Execute vector normalization kernel
- ctx.enqueue_function[
- block_normalize_vector[in_layout, vector_layout, TPB]
- ](
+ alias kernel = block_normalize_vector[
+ in_layout, vector_layout, TPB
+ ]
+ ctx.enqueue_function_checked[kernel, kernel](
input_tensor,
output_tensor,
SIZE,
diff --git a/problems/p28/p28.mojo b/problems/p28/p28.mojo
index b9e8e0ae..7965fa4d 100644
--- a/problems/p28/p28.mojo
+++ b/problems/p28/p28.mojo
@@ -23,9 +23,9 @@ alias layout_async = Layout.row_major(VECTOR_SIZE)
fn async_copy_overlap_convolution[
dtype: DType, layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
- kernel: LayoutTensor[mut=False, dtype, Layout.row_major(KERNEL_SIZE)],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
+ kernel: LayoutTensor[dtype, Layout.row_major(KERNEL_SIZE), ImmutAnyOrigin],
):
"""Demonstrates async copy operations building on p14 patterns.
@@ -37,13 +37,13 @@ fn async_copy_overlap_convolution[
input_shared = LayoutTensor[
dtype,
Layout.row_major(CONV_TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
kernel_shared = LayoutTensor[
dtype,
Layout.row_major(KERNEL_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -76,19 +76,18 @@ def test_async_copy_overlap_convolution():
for i in range(KERNEL_SIZE):
kernel_host[i] = Float32(i + 1)
- input_tensor = LayoutTensor[mut=False, dtype, layout_async](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, layout_async, ImmutAnyOrigin](
+ input_buf
)
- output_tensor = LayoutTensor[mut=True, dtype, layout_async](
- output_buf.unsafe_ptr()
+ output_tensor = LayoutTensor[dtype, layout_async, MutAnyOrigin](
+ output_buf
)
- kernel_tensor = LayoutTensor[
- mut=False, dtype, Layout.row_major(KERNEL_SIZE)
- ](kernel_buf.unsafe_ptr())
+ kernel_tensor = LayoutTensor[
+ dtype, Layout.row_major(KERNEL_SIZE), ImmutAnyOrigin
+ ](kernel_buf)
- ctx.enqueue_function[
- async_copy_overlap_convolution[dtype, layout_async]
- ](
+ alias kernel = async_copy_overlap_convolution[dtype, layout_async]
+ ctx.enqueue_function_checked[kernel, kernel](
output_tensor,
input_tensor,
kernel_tensor,
diff --git a/problems/p29/p29.mojo b/problems/p29/p29.mojo
index acc89e8e..cee58e6c 100644
--- a/problems/p29/p29.mojo
+++ b/problems/p29/p29.mojo
@@ -32,8 +32,8 @@ alias BLUR_RADIUS = 2
fn multi_stage_image_blur_pipeline[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
"""Multi-stage image blur pipeline with barrier coordination.
@@ -47,13 +47,13 @@ fn multi_stage_image_blur_pipeline[
input_shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
blur_shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -91,8 +91,8 @@ alias BUFFER_COUNT = 2
fn double_buffered_stencil_computation[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
"""Double-buffered stencil computation with memory barrier coordination.
@@ -105,13 +105,13 @@ fn double_buffered_stencil_computation[
buffer_A = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
buffer_B = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -119,19 +119,19 @@ fn double_buffered_stencil_computation[
init_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
iter_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
final_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -209,10 +209,11 @@ def test_multi_stage_pipeline():
inp_host[i] = Float32(i % 10) + Float32(i / 100.0)
# Create LayoutTensors
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- inp_tensor = LayoutTensor[mut=False, dtype, layout](inp.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ inp_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp)
- ctx.enqueue_function[multi_stage_image_blur_pipeline[layout]](
+ alias kernel = multi_stage_image_blur_pipeline[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
inp_tensor,
SIZE,
@@ -268,10 +269,11 @@ def test_double_buffered_stencil():
inp_host[i] = Float32(1.0 if i % 20 < 10 else 0.0)
# Create LayoutTensors for Puzzle 29B
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- inp_tensor = LayoutTensor[mut=False, dtype, layout](inp.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ inp_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp)
- ctx.enqueue_function[double_buffered_stencil_computation[layout]](
+ alias kernel = double_buffered_stencil_computation[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
inp_tensor,
SIZE,
diff --git a/problems/p30/p30.mojo b/problems/p30/p30.mojo
index 1f708bba..c6cd7b14 100644
--- a/problems/p30/p30.mojo
+++ b/problems/p30/p30.mojo
@@ -16,9 +16,9 @@ alias layout = Layout.row_major(SIZE)
fn kernel1[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=False, dtype, layout],
- b: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
i = block_dim.x * block_idx.x + thread_idx.x
@@ -33,9 +33,9 @@ fn kernel1[
fn kernel2[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=False, dtype, layout],
- b: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
tid = block_idx.x * block_dim.x + thread_idx.x
@@ -54,9 +54,9 @@ fn kernel2[
fn kernel3[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=False, dtype, layout],
- b: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
tid = block_idx.x * block_dim.x + thread_idx.x
@@ -88,11 +88,11 @@ fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
a_host[i] = Float32(i + 1)
b_host[i] = Float32(i + 2)
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b_buf.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b_buf)
- ctx.enqueue_function[kernel1[layout]](
+ ctx.enqueue_function_checked[kernel1[layout], kernel1[layout]](
out_tensor,
a_tensor,
b_tensor,
@@ -100,7 +100,7 @@ fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
- keep(out.unsafe_ptr())
+ keep(out)
ctx.synchronize()
bench_ctx = DeviceContext()
@@ -123,11 +123,11 @@ fn benchmark_kernel2_parameterized[test_size: Int](mut b: Bencher) raises:
a_host[i] = Float32(i + 1)
b_host[i] = Float32(i + 2)
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b_buf.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b_buf)
- ctx.enqueue_function[kernel2[layout]](
+ ctx.enqueue_function_checked[kernel2[layout], kernel2[layout]](
out_tensor,
a_tensor,
b_tensor,
@@ -135,7 +135,7 @@ fn benchmark_kernel2_parameterized[test_size: Int](mut b: Bencher) raises:
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
- keep(out.unsafe_ptr())
+ keep(out)
ctx.synchronize()
bench_ctx = DeviceContext()
@@ -158,11 +158,11 @@ fn benchmark_kernel3_parameterized[test_size: Int](mut b: Bencher) raises:
a_host[i] = Float32(i + 1)
b_host[i] = Float32(i + 2)
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b_buf.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b_buf)
- ctx.enqueue_function[kernel3[layout]](
+ ctx.enqueue_function_checked[kernel3[layout], kernel3[layout]](
out_tensor,
a_tensor,
b_tensor,
@@ -170,7 +170,7 @@ fn benchmark_kernel3_parameterized[test_size: Int](mut b: Bencher) raises:
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
- keep(out.unsafe_ptr())
+ keep(out)
ctx.synchronize()
bench_ctx = DeviceContext()
@@ -192,11 +192,11 @@ def test_kernel1():
b_host[i] = Float32(i + 2)
# Create LayoutTensors
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[kernel1[layout]](
+ ctx.enqueue_function_checked[kernel1[layout], kernel1[layout]](
out_tensor,
a_tensor,
b_tensor,
@@ -232,11 +232,11 @@ def test_kernel2():
b_host[i] = Float32(i + 2)
# Create LayoutTensors
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[kernel2[layout]](
+ ctx.enqueue_function_checked[kernel2[layout], kernel2[layout]](
out_tensor,
a_tensor,
b_tensor,
@@ -275,11 +275,11 @@ def test_kernel3():
b_host[i] = Float32(i + 2)
# Create LayoutTensors
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[mut=False, dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[kernel3[layout]](
+ ctx.enqueue_function_checked[kernel3[layout], kernel3[layout]](
out_tensor,
a_tensor,
b_tensor,
diff --git a/problems/p31/p31.mojo b/problems/p31/p31.mojo
index 62bed3f8..373a38ad 100644
--- a/problems/p31/p31.mojo
+++ b/problems/p31/p31.mojo
@@ -49,7 +49,7 @@ fn sophisticated_kernel[
shared_cache = LayoutTensor[
dtype,
Layout.row_major(1024 * 12),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation() # 48KB
@@ -146,7 +146,7 @@ fn balanced_kernel[
shared_cache = LayoutTensor[
dtype,
Layout.row_major(1024 * 4),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation() # 16KB total
diff --git a/problems/p32/p32.mojo b/problems/p32/p32.mojo
index 71e35fe5..e6cf50cd 100644
--- a/problems/p32/p32.mojo
+++ b/problems/p32/p32.mojo
@@ -32,7 +32,7 @@ fn no_conflict_kernel[
shared_buf = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -75,7 +75,7 @@ fn two_way_conflict_kernel[
shared_buf = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/problems/p33/p33.mojo b/problems/p33/p33.mojo
index 4cf1b28d..b8788dc1 100644
--- a/problems/p33/p33.mojo
+++ b/problems/p33/p33.mojo
@@ -43,13 +43,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(TILE_SIZE, TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(TILE_SIZE, TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -158,13 +158,13 @@ fn tensor_core_matrix_multiplication[
A_sram_tile = LayoutTensor[
A.dtype,
Layout.row_major(BM, BK),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
B_sram_tile = LayoutTensor[
B.dtype,
Layout.row_major(BK, BN),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -172,7 +172,7 @@ fn tensor_core_matrix_multiplication[
C_warp_accum = LayoutTensor[
C.dtype,
Layout.row_major(WM, WN),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.GENERIC,
].stack_allocation()
diff --git a/problems/p34/p34.mojo b/problems/p34/p34.mojo
index 6c9bf308..d064d9c6 100644
--- a/problems/p34/p34.mojo
+++ b/problems/p34/p34.mojo
@@ -39,7 +39,7 @@ fn cluster_coordination_basics[
shared_data = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p01/p01.mojo b/solutions/p01/p01.mojo
index a769de40..2ec97047 100644
--- a/solutions/p01/p01.mojo
+++ b/solutions/p01/p01.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10, add_10](
+ out,
+ a,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
diff --git a/solutions/p02/p02.mojo b/solutions/p02/p02.mojo
index 02d1a858..6628e174 100644
--- a/solutions/p02/p02.mojo
+++ b/solutions/p02/p02.mojo
@@ -34,10 +34,10 @@ def main():
b_host[i] = i
expected[i] = a_host[i] + b_host[i]
- ctx.enqueue_function[add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[add, add](
+ out,
+ a,
+ b,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
diff --git a/solutions/p03/p03.mojo b/solutions/p03/p03.mojo
index 1282610f..fbd818b5 100644
--- a/solutions/p03/p03.mojo
+++ b/solutions/p03/p03.mojo
@@ -31,9 +31,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10_guard](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_guard, add_10_guard](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p04/p04.mojo b/solutions/p04/p04.mojo
index cb71c88a..ddd9ad82 100644
--- a/solutions/p04/p04.mojo
+++ b/solutions/p04/p04.mojo
@@ -38,9 +38,9 @@ def main():
a_host[y * SIZE + x] = y * SIZE + x
expected[y * SIZE + x] = a_host[y * SIZE + x] + 10
- ctx.enqueue_function[add_10_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p04/p04_layout_tensor.mojo b/solutions/p04/p04_layout_tensor.mojo
index 09965f57..9eb656e6 100644
--- a/solutions/p04/p04_layout_tensor.mojo
+++ b/solutions/p04/p04_layout_tensor.mojo
@@ -12,8 +12,8 @@ alias layout = Layout.row_major(SIZE, SIZE)
# ANCHOR: add_10_2d_layout_tensor_solution
fn add_10_2d(
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, MutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -28,9 +28,9 @@ fn add_10_2d(
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, layout](
- out_buf.unsafe_ptr()
- ).reshape[layout]()
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out_buf).reshape[
+ layout
+ ]()
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())
expected = ctx.enqueue_create_host_buffer[dtype](
@@ -43,11 +43,11 @@ def main():
a_host[i] = i
expected[i] = a_host[i] + 10
- a_tensor = LayoutTensor[mut=True, dtype, layout](
- a.unsafe_ptr()
- ).reshape[layout]()
+ a_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a).reshape[
+ layout
+ ]()
- ctx.enqueue_function[add_10_2d](
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
out_tensor,
a_tensor,
SIZE,
diff --git a/solutions/p05/p05.mojo b/solutions/p05/p05.mojo
index aa59180d..c9ca2411 100644
--- a/solutions/p05/p05.mojo
+++ b/solutions/p05/p05.mojo
@@ -42,10 +42,10 @@ def main():
for x in range(SIZE):
expected[y * SIZE + x] = a_host[x] + b_host[y]
- ctx.enqueue_function[broadcast_add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[broadcast_add, broadcast_add](
+ out,
+ a,
+ b,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p05/p05_layout_tensor.mojo b/solutions/p05/p05_layout_tensor.mojo
index 10c1b8f5..ab864d8e 100644
--- a/solutions/p05/p05_layout_tensor.mojo
+++ b/solutions/p05/p05_layout_tensor.mojo
@@ -18,9 +18,9 @@ fn broadcast_add[
a_layout: Layout,
b_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
- b: LayoutTensor[mut=False, dtype, b_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -35,16 +35,14 @@ fn broadcast_add[
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())
expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
).enqueue_fill(0)
- expected_tensor = LayoutTensor[mut=True, dtype, out_layout](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
@@ -58,10 +56,11 @@ def main():
for j in range(SIZE):
expected_tensor[i, j] = a_host[j] + b_host[i]
- a_tensor = LayoutTensor[dtype, a_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[dtype, b_layout](b.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, b_layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[broadcast_add[out_layout, a_layout, b_layout]](
+ alias kernel = broadcast_add[out_layout, a_layout, b_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
diff --git a/solutions/p06/p06.mojo b/solutions/p06/p06.mojo
index 84823c3b..209141a2 100644
--- a/solutions/p06/p06.mojo
+++ b/solutions/p06/p06.mojo
@@ -31,9 +31,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[add_10_blocks](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks, add_10_blocks](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p07/p07.mojo b/solutions/p07/p07.mojo
index 0c86ec6e..31bc4d96 100644
--- a/solutions/p07/p07.mojo
+++ b/solutions/p07/p07.mojo
@@ -39,9 +39,9 @@ def main():
a_host[k] = k
expected[k] = k + 10
- ctx.enqueue_function[add_10_blocks_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p07/p07_layout_tensor.mojo b/solutions/p07/p07_layout_tensor.mojo
index 7a81832c..ecb06c8e 100644
--- a/solutions/p07/p07_layout_tensor.mojo
+++ b/solutions/p07/p07_layout_tensor.mojo
@@ -16,8 +16,8 @@ fn add_10_blocks_2d[
out_layout: Layout,
a_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
size: Int,
):
row = block_dim.y * block_idx.y + thread_idx.y
@@ -32,9 +32,7 @@ fn add_10_blocks_2d[
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)
expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
@@ -49,11 +47,10 @@ def main():
a_host[k] = k
expected_buf[k] = k + 10
- a_tensor = LayoutTensor[dtype, a_layout, MutableAnyOrigin](
- a.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[add_10_blocks_2d[out_layout, a_layout]](
+ alias kernel = add_10_blocks_2d[out_layout, a_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
SIZE,
@@ -63,16 +60,14 @@ def main():
ctx.synchronize()
- expected_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)
with out_buf.map_to_host() as out_buf_host:
print(
"out:",
- LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf_host.unsafe_ptr()
- ),
+ LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf_host),
)
print("expected:", expected_tensor)
for i in range(SIZE):
diff --git a/solutions/p08/p08.mojo b/solutions/p08/p08.mojo
index cd62ffd0..fe26e8bc 100644
--- a/solutions/p08/p08.mojo
+++ b/solutions/p08/p08.mojo
@@ -49,9 +49,9 @@ def main():
with DeviceContext() as ctx:
out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(1)
- ctx.enqueue_function[add_10_shared](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_shared, add_10_shared](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p08/p08_layout_tensor.mojo b/solutions/p08/p08_layout_tensor.mojo
index 8b46e8c0..edcf3b3b 100644
--- a/solutions/p08/p08_layout_tensor.mojo
+++ b/solutions/p08/p08_layout_tensor.mojo
@@ -17,15 +17,15 @@ alias layout = Layout.row_major(SIZE)
fn add_10_shared_layout_tensor[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
# Allocate shared memory using tensor builder
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -53,10 +53,11 @@ def main():
out = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(1)
- out_tensor = LayoutTensor[dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[add_10_shared_layout_tensor[layout]](
+ alias kernel = add_10_shared_layout_tensor[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
SIZE,
diff --git a/solutions/p10/p10.mojo b/solutions/p10/p10.mojo
index b68cb0ed..d1afb2da 100644
--- a/solutions/p10/p10.mojo
+++ b/solutions/p10/p10.mojo
@@ -27,7 +27,7 @@ fn shared_memory_race(
shared_sum = LayoutTensor[
dtype,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p11/p11.mojo b/solutions/p11/p11.mojo
index ad84d2fd..c726363a 100644
--- a/solutions/p11/p11.mojo
+++ b/solutions/p11/p11.mojo
@@ -51,9 +51,9 @@ def main():
for i in range(SIZE):
a_host[i] = i
- ctx.enqueue_function[pooling](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[pooling, pooling](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
@@ -64,7 +64,7 @@ def main():
ctx.synchronize()
with a.map_to_host() as a_host:
- ptr = a_host.unsafe_ptr()
+ ptr = a_host
for i in range(SIZE):
s = Scalar[dtype](0)
for j in range(max(i - 2, 0), i + 1):
diff --git a/solutions/p11/p11_layout_tensor.mojo b/solutions/p11/p11_layout_tensor.mojo
index 509b93b7..6b7e0a40 100644
--- a/solutions/p11/p11_layout_tensor.mojo
+++ b/solutions/p11/p11_layout_tensor.mojo
@@ -16,15 +16,15 @@ alias layout = Layout.row_major(SIZE)
fn pooling[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
# Allocate shared memory using tensor builder
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -62,10 +62,10 @@ def main():
for i in range(SIZE):
a_host[i] = i
- out_tensor = LayoutTensor[dtype, layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
- ctx.enqueue_function[pooling[layout]](
+ ctx.enqueue_function_checked[pooling[layout], pooling[layout]](
out_tensor,
a_tensor,
SIZE,
@@ -77,7 +77,7 @@ def main():
ctx.synchronize()
with a.map_to_host() as a_host:
- ptr = a_host.unsafe_ptr()
+ ptr = a_host
for i in range(SIZE):
s = Scalar[dtype](0)
for j in range(max(i - 2, 0), i + 1):
diff --git a/solutions/p12/p12.mojo b/solutions/p12/p12.mojo
index b8fa0d28..58de6394 100644
--- a/solutions/p12/p12.mojo
+++ b/solutions/p12/p12.mojo
@@ -67,10 +67,10 @@ def main():
a_host[i] = i
b_host[i] = i
- ctx.enqueue_function[dot_product](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[dot_product, dot_product](
+ out,
+ a,
+ b,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
diff --git a/solutions/p12/p12_layout_tensor.mojo b/solutions/p12/p12_layout_tensor.mojo
index 2e4f2fa9..0368b86e 100644
--- a/solutions/p12/p12_layout_tensor.mojo
+++ b/solutions/p12/p12_layout_tensor.mojo
@@ -17,15 +17,15 @@ alias out_layout = Layout.row_major(1)
fn dot_product[
in_layout: Layout, out_layout: Layout
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=True, dtype, in_layout],
- b: LayoutTensor[mut=True, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -66,11 +66,12 @@ def main():
a_host[i] = i
b_host[i] = i
- out_tensor = LayoutTensor[dtype, out_layout](out.unsafe_ptr())
- a_tensor = LayoutTensor[dtype, layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[dtype, layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](b)
- ctx.enqueue_function[dot_product[layout, out_layout]](
+ alias kernel = dot_product[layout, out_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
diff --git a/solutions/p13/p13.mojo b/solutions/p13/p13.mojo
index d973b72a..8569a367 100644
--- a/solutions/p13/p13.mojo
+++ b/solutions/p13/p13.mojo
@@ -29,13 +29,13 @@ fn conv_1d_simple[
shared_a = LayoutTensor[
dtype,
Layout.row_major(SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(CONV),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < SIZE:
@@ -97,13 +97,13 @@ fn conv_1d_block_boundary[
shared_a = LayoutTensor[
dtype,
Layout.row_major(TPB + CONV_2 - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(CONV_2),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < SIZE_2:
diff --git a/solutions/p14/p14.mojo b/solutions/p14/p14.mojo
index 4fe26559..6aeb7fae 100644
--- a/solutions/p14/p14.mojo
+++ b/solutions/p14/p14.mojo
@@ -27,7 +27,7 @@ fn prefix_sum_simple[
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < size:
@@ -77,7 +77,7 @@ fn prefix_sum_local_phase[
shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p15/p15.mojo b/solutions/p15/p15.mojo
index 20ca2d33..cac1dcbd 100644
--- a/solutions/p15/p15.mojo
+++ b/solutions/p15/p15.mojo
@@ -29,7 +29,7 @@ fn axis_sum[
cache = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p16/p16.mojo b/solutions/p16/p16.mojo
index d91dcde3..8fe16e59 100644
--- a/solutions/p16/p16.mojo
+++ b/solutions/p16/p16.mojo
@@ -53,13 +53,13 @@ fn single_block_matmul[
a_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -104,13 +104,13 @@ fn matmul_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -174,13 +174,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(TPB, TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p17/op/conv1d.mojo b/solutions/p17/op/conv1d.mojo
index 2c0ec755..c6882db6 100644
--- a/solutions/p17/op/conv1d.mojo
+++ b/solutions/p17/op/conv1d.mojo
@@ -17,9 +17,9 @@ fn conv1d_kernel[
conv_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- input: LayoutTensor[mut=True, dtype, in_layout],
- kernel: LayoutTensor[mut=True, dtype, conv_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
+ kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
local_i = thread_idx.x
@@ -27,13 +27,13 @@ fn conv1d_kernel[
shared_a = LayoutTensor[
dtype,
Layout.row_major(TPB + conv_size - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(conv_size),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < input_size:
@@ -112,11 +112,10 @@ struct Conv1DCustomOp:
0,
)
# ANCHOR: conv1d_custom_op_solution
- gpu_ctx.enqueue_function[
- conv1d_kernel[
- in_layout, out_layout, conv_layout, input_size, conv_size
- ]
- ](
+ alias kernel = conv1d_kernel[
+ in_layout, out_layout, conv_layout, input_size, conv_size
+ ]
+ gpu_ctx.enqueue_function_checked[kernel, kernel](
output_tensor,
input_tensor,
kernel_tensor,
diff --git a/solutions/p18/op/softmax.mojo b/solutions/p18/op/softmax.mojo
index 3341d899..02843522 100644
--- a/solutions/p18/op/softmax.mojo
+++ b/solutions/p18/op/softmax.mojo
@@ -27,13 +27,13 @@ fn softmax_gpu_kernel[
shared_max = LayoutTensor[
dtype,
Layout.row_major(BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_sum = LayoutTensor[
dtype,
Layout.row_major(BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = thread_idx.x
@@ -93,8 +93,8 @@ fn softmax_cpu_kernel[
input_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[dtype, layout, MutableAnyOrigin],
- input: LayoutTensor[dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, MutAnyOrigin],
):
var max_val: Scalar[dtype] = min_finite[dtype]()
for i in range(input_size):
@@ -130,12 +130,12 @@ struct SoftmaxCustomOp:
ctx: DeviceContextPtr,
) raises:
# Note: rebind is necessary now but it shouldn't be!
- var output_tensor = rebind[
- LayoutTensor[dtype, layout, MutableAnyOrigin]
- ](output.to_layout_tensor())
- var input_tensor = rebind[
- LayoutTensor[dtype, layout, MutableAnyOrigin]
- ](input.to_layout_tensor())
+ var output_tensor = rebind[LayoutTensor[dtype, layout, MutAnyOrigin]](
+ output.to_layout_tensor()
+ )
+ var input_tensor = rebind[LayoutTensor[dtype, layout, MutAnyOrigin]](
+ input.to_layout_tensor()
+ )
@parameter
if target == "gpu":
diff --git a/solutions/p19/op/attention.mojo b/solutions/p19/op/attention.mojo
index 80c79c5a..903fc4fa 100644
--- a/solutions/p19/op/attention.mojo
+++ b/solutions/p19/op/attention.mojo
@@ -39,9 +39,9 @@ fn matmul_idiomatic_tiled[
inner: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, a_layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, b_layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, a_layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
):
"""Updated idiomatic tiled matrix multiplication from p16."""
local_row = thread_idx.y
@@ -56,13 +56,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
var acc: output.element_type = 0
@@ -126,14 +126,14 @@ fn transpose_kernel[
cols: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
- inp: LayoutTensor[mut=False, dtype, layout_in, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout_out, MutAnyOrigin],
+ inp: LayoutTensor[mut=False, dtype, layout_in, MutAnyOrigin],
):
"""Transpose matrix using shared memory tiling for coalesced access."""
shared_tile = LayoutTensor[
dtype,
Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -172,13 +172,13 @@ fn softmax_gpu_kernel[
shared_max = LayoutTensor[
dtype,
Layout.row_major(SOFTMAX_BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_sum = LayoutTensor[
dtype,
Layout.row_major(SOFTMAX_BLOCK_DIM_X),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = thread_idx.x
@@ -239,10 +239,10 @@ fn attention_cpu_kernel[
d: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[dtype, layout_out, MutableAnyOrigin],
- q: LayoutTensor[dtype, layout_q, MutableAnyOrigin],
- k: LayoutTensor[dtype, layout_k, MutableAnyOrigin],
- v: LayoutTensor[dtype, layout_v, MutableAnyOrigin],
+ output: LayoutTensor[dtype, layout_out, MutAnyOrigin],
+ q: LayoutTensor[dtype, layout_q, MutAnyOrigin],
+ k: LayoutTensor[dtype, layout_k, MutAnyOrigin],
+ v: LayoutTensor[dtype, layout_v, MutAnyOrigin],
):
"""CPU implementation of vector attention."""
var scores = List[Float32]()
@@ -304,15 +304,15 @@ struct AttentionCustomOp:
# Convert to layout tensors
var output_tensor = rebind[
- LayoutTensor[dtype, layout_out, MutableAnyOrigin]
+ LayoutTensor[dtype, layout_out, MutAnyOrigin]
](output.to_layout_tensor())
- var q_tensor = rebind[LayoutTensor[dtype, layout_q, MutableAnyOrigin]](
+ var q_tensor = rebind[LayoutTensor[dtype, layout_q, MutAnyOrigin]](
q.to_layout_tensor()
)
- var k_tensor = rebind[LayoutTensor[dtype, layout_k, MutableAnyOrigin]](
+ var k_tensor = rebind[LayoutTensor[dtype, layout_k, MutAnyOrigin]](
k.to_layout_tensor()
)
- var v_tensor = rebind[LayoutTensor[dtype, layout_v, MutableAnyOrigin]](
+ var v_tensor = rebind[LayoutTensor[dtype, layout_v, MutAnyOrigin]](
v.to_layout_tensor()
)
@@ -367,7 +367,7 @@ struct AttentionCustomOp:
seq_len
) # Reused for scores and weights
- k_t = LayoutTensor[mut=True, dtype, layout_k_t, MutableAnyOrigin](
+ k_t = LayoutTensor[mut=True, dtype, layout_k_t, MutAnyOrigin](
k_t_buf.unsafe_ptr()
)
@@ -390,7 +390,7 @@ struct AttentionCustomOp:
# This computes Q · K^T[i] = Q · K[i] for each column i of K^T (which is row i of K)
# Reuse scores_weights_buf as (1, seq_len) for scores
scores_2d = LayoutTensor[
- mut=True, dtype, layout_scores_2d, MutableAnyOrigin
+ mut=True, dtype, layout_scores_2d, MutAnyOrigin
](scores_weights_buf.unsafe_ptr())
gpu_ctx.enqueue_function[
matmul_idiomatic_tiled[
diff --git a/solutions/p20/op/conv1d.mojo b/solutions/p20/op/conv1d.mojo
index 21a2f075..ef974087 100644
--- a/solutions/p20/op/conv1d.mojo
+++ b/solutions/p20/op/conv1d.mojo
@@ -18,9 +18,9 @@ fn conv1d_kernel[
conv_size: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- input: LayoutTensor[mut=True, dtype, in_layout],
- kernel: LayoutTensor[mut=True, dtype, conv_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, in_layout, MutAnyOrigin],
+ kernel: LayoutTensor[dtype, conv_layout, MutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
local_i = thread_idx.x
@@ -28,13 +28,13 @@ fn conv1d_kernel[
shared_a = LayoutTensor[
dtype,
Layout.row_major(TPB + conv_size - 1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_b = LayoutTensor[
dtype,
Layout.row_major(conv_size),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
if global_i < input_size:
@@ -111,11 +111,10 @@ struct Conv1DCustomOp:
0,
)
# ANCHOR: conv1d_custom_op_solution
- gpu_ctx.enqueue_function[
- conv1d_kernel[
- in_layout, out_layout, conv_layout, input_size, conv_size
- ]
- ](
+ alias kernel = conv1d_kernel[
+ in_layout, out_layout, conv_layout, input_size, conv_size
+ ]
+ gpu_ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
input_tensor,
kernel_tensor,
diff --git a/solutions/p21/op/embedding.mojo b/solutions/p21/op/embedding.mojo
index 95ed5582..284da92c 100644
--- a/solutions/p21/op/embedding.mojo
+++ b/solutions/p21/op/embedding.mojo
@@ -19,9 +19,9 @@ fn embedding_kernel_coalesced[
embed_dim: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- indices: LayoutTensor[mut=True, DType.int32, indices_layout],
- weights: LayoutTensor[mut=True, dtype, weights_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ indices: LayoutTensor[DType.int32, indices_layout, MutAnyOrigin],
+ weights: LayoutTensor[dtype, weights_layout, MutAnyOrigin],
):
"""
Memory-coalescing focused embedding kernel.
@@ -71,9 +71,9 @@ fn embedding_kernel_2d[
embed_dim: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- indices: LayoutTensor[mut=True, DType.int32, indices_layout],
- weights: LayoutTensor[mut=True, dtype, weights_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ indices: LayoutTensor[DType.int32, indices_layout, MutAnyOrigin],
+ weights: LayoutTensor[dtype, weights_layout, MutAnyOrigin],
):
"""
2D grid non-coalesced embedding kernel.
@@ -171,20 +171,19 @@ struct EmbeddingCustomOp:
blocks = max(1, ceildiv(total_elements, THREADS_PER_BLOCK))
# Compile and launch optimized kernel
- compiled_kernel = gpu_ctx.compile_function[
- embedding_kernel_coalesced[
- indices_layout,
- weights_layout,
- out_layout,
- batch_size,
- seq_len,
- vocab_size,
- embed_dim,
- output.dtype,
- ]
- ]()
-
- gpu_ctx.enqueue_function(
+ alias kernel = embedding_kernel_coalesced[
+ indices_layout,
+ weights_layout,
+ out_layout,
+ batch_size,
+ seq_len,
+ vocab_size,
+ embed_dim,
+ output.dtype,
+ ]
+ compiled_kernel = gpu_ctx.compile_function_checked[kernel, kernel]()
+
+ gpu_ctx.enqueue_function_checked(
compiled_kernel,
output_tensor,
indices_tensor,
diff --git a/solutions/p22/op/layernorm_linear.mojo b/solutions/p22/op/layernorm_linear.mojo
index dc055250..50856061 100644
--- a/solutions/p22/op/layernorm_linear.mojo
+++ b/solutions/p22/op/layernorm_linear.mojo
@@ -26,9 +26,9 @@ fn matmul_idiomatic_tiled[
inner: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, a_layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, b_layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, a_layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, b_layout, MutAnyOrigin],
):
"""Idiomatic tiled matrix multiplication from p19."""
local_row = thread_idx.y
@@ -43,13 +43,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(MATMUL_BLOCK_DIM_XY, MATMUL_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
var acc: output.element_type = 0
@@ -167,8 +167,8 @@ fn transpose_kernel[
cols: Int,
dtype: DType = DType.float32,
](
- output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
- inp: LayoutTensor[mut=False, dtype, layout_in, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout_out, MutAnyOrigin],
+ inp: LayoutTensor[mut=False, dtype, layout_in, MutAnyOrigin],
):
"""Transpose matrix using shared memory tiling for coalesced access.
We will learn more about coalesced access in the next part.
@@ -176,7 +176,7 @@ fn transpose_kernel[
shared_tile = LayoutTensor[
dtype,
Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
diff --git a/solutions/p23/p23.mojo b/solutions/p23/p23.mojo
index c30683af..a654ed4a 100644
--- a/solutions/p23/p23.mojo
+++ b/solutions/p23/p23.mojo
@@ -20,9 +20,9 @@ alias SIMD_WIDTH = simd_width_of[dtype, target = get_gpu_target()]()
fn elementwise_add[
layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -60,9 +60,9 @@ fn tiled_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -100,9 +100,9 @@ fn manual_vectorized_tiled_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
# Each tile contains tile_size groups of simd_width elements
@@ -150,9 +150,9 @@ fn vectorize_within_tiles_elementwise_add[
size: Int,
tile_size: Int,
](
- output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
# Each tile contains tile_size elements (not SIMD groups)
@@ -203,13 +203,13 @@ fn benchmark_elementwise_parameterized[
a_host[i] = 2 * i
b_host[i] = 2 * i + 1
- a_tensor = LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin](
+ a_tensor = LayoutTensor[mut=False, dtype, layout, MutAnyOrigin](
a.unsafe_ptr()
)
- b_tensor = LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin](
+ b_tensor = LayoutTensor[mut=False, dtype, layout, MutAnyOrigin](
b_buf.unsafe_ptr()
)
- out_tensor = LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin](
+ out_tensor = LayoutTensor[mut=True, dtype, layout, MutAnyOrigin](
out.unsafe_ptr()
)
diff --git a/solutions/p24/p24.mojo b/solutions/p24/p24.mojo
index 17d7d243..c4fd95c5 100644
--- a/solutions/p24/p24.mojo
+++ b/solutions/p24/p24.mojo
@@ -34,9 +34,9 @@ alias out_layout = Layout.row_major(1)
fn traditional_dot_product_p12_style[
in_layout: Layout, out_layout: Layout, size: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
):
"""
This is the complex approach from p12_layout_tensor.mojo - kept for comparison.
@@ -44,7 +44,7 @@ fn traditional_dot_product_p12_style[
shared = LayoutTensor[
dtype,
Layout.row_major(WARP_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -75,9 +75,9 @@ fn traditional_dot_product_p12_style[
fn simple_warp_dot_product[
in_layout: Layout, out_layout: Layout, size: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
):
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -106,9 +106,9 @@ fn functional_warp_dot_product[
rank: Int,
size: Int,
](
- output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
- a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
- b: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
+ output: LayoutTensor[mut=True, dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
+ b: LayoutTensor[mut=False, dtype, layout, MutAnyOrigin],
ctx: DeviceContext,
) raises:
@parameter
@@ -203,16 +203,15 @@ fn benchmark_simple_warp_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
fn traditional_workflow(ctx: DeviceContext) raises:
- ctx.enqueue_function[
- simple_warp_dot_product[in_layout, out_layout, test_size]
- ](
+ alias kernel = simple_warp_dot_product[in_layout, out_layout, test_size]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -250,9 +249,9 @@ fn benchmark_functional_warp_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
@@ -292,15 +291,16 @@ fn benchmark_traditional_parameterized[
rand_int[dtype, test_size](b)
expected_output[dtype, n_warps](expected, a, b)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](out.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
@parameter
@always_inline
fn traditional_workflow(ctx: DeviceContext) raises:
- ctx.enqueue_function[
- traditional_dot_product_p12_style[in_layout, out_layout, test_size]
+ ctx.enqueue_function_checked[
+ traditional_dot_product_p12_style[in_layout, out_layout, test_size],
+ traditional_dot_product_p12_style[in_layout, out_layout, test_size],
](
out_tensor,
a_tensor,
@@ -331,11 +331,9 @@ def main():
n_warps
).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](b.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b)
with a.map_to_host() as a_host, b.map_to_host() as b_host:
for i in range(SIZE):
@@ -343,10 +341,13 @@ def main():
b_host[i] = i
if argv()[1] == "--traditional":
- ctx.enqueue_function[
+ ctx.enqueue_function_checked[
+ traditional_dot_product_p12_style[
+ in_layout, out_layout, SIZE
+ ],
traditional_dot_product_p12_style[
in_layout, out_layout, SIZE
- ]
+ ],
](
out_tensor,
a_tensor,
@@ -355,8 +356,9 @@ def main():
block_dim=THREADS_PER_BLOCK,
)
elif argv()[1] == "--kernel":
- ctx.enqueue_function[
- simple_warp_dot_product[in_layout, out_layout, SIZE]
+ ctx.enqueue_function_checked[
+ simple_warp_dot_product[in_layout, out_layout, SIZE],
+ simple_warp_dot_product[in_layout, out_layout, SIZE],
](
out_tensor,
a_tensor,
diff --git a/solutions/p27/p27.mojo b/solutions/p27/p27.mojo
index 38ac5f0c..d5d7eb6f 100644
--- a/solutions/p27/p27.mojo
+++ b/solutions/p27/p27.mojo
@@ -21,9 +21,9 @@ alias dtype = DType.float32
fn block_sum_dot_product[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
"""Dot product using block.sum() - convenience function like warp.sum()!
@@ -56,9 +56,9 @@ fn block_sum_dot_product[
fn traditional_dot_product[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, in_layout],
- b: LayoutTensor[mut=False, dtype, in_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
size: Int,
):
"""Traditional dot product using shared memory + barriers + tree reduction.
@@ -67,7 +67,7 @@ fn traditional_dot_product[
shared = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -103,9 +103,9 @@ alias bin_layout = Layout.row_major(SIZE) # Max SIZE elements per bin
fn block_histogram_bin_extract[
in_layout: Layout, bin_layout: Layout, out_layout: Layout, tpb: Int
](
- input_data: LayoutTensor[mut=False, dtype, in_layout],
- bin_output: LayoutTensor[mut=True, dtype, bin_layout],
- count_output: LayoutTensor[mut=True, DType.int32, out_layout],
+ input_data: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ bin_output: LayoutTensor[dtype, bin_layout, MutAnyOrigin],
+ count_output: LayoutTensor[DType.int32, out_layout, MutAnyOrigin],
size: Int,
target_bin: Int,
num_bins: Int,
@@ -167,8 +167,8 @@ alias vector_layout = Layout.row_major(SIZE) # For full vector output
fn block_normalize_vector[
in_layout: Layout, out_layout: Layout, tpb: Int
](
- input_data: LayoutTensor[mut=False, dtype, in_layout],
- output_data: LayoutTensor[mut=True, dtype, out_layout],
+ input_data: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
+ output_data: LayoutTensor[dtype, out_layout, MutAnyOrigin],
size: Int,
):
"""Vector mean normalization using block.sum() + block.broadcast() combination.
@@ -239,18 +239,13 @@ def main():
print("TPB:", TPB)
print("Expected result:", expected)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](
- b_buf.unsafe_ptr()
- )
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b_buf)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
# Traditional approach: works perfectly when size == TPB
- ctx.enqueue_function[
- traditional_dot_product[in_layout, out_layout, TPB]
- ](
+ alias kernel = traditional_dot_product[in_layout, out_layout, TPB]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -283,18 +278,13 @@ def main():
print("TPB:", TPB)
print("Expected result:", expected)
- a_tensor = LayoutTensor[mut=False, dtype, in_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[mut=False, dtype, in_layout](
- b_buf.unsafe_ptr()
- )
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](b_buf)
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out)
# Block.sum(): Same result with dramatically simpler code!
- ctx.enqueue_function[
- block_sum_dot_product[in_layout, out_layout, TPB]
- ](
+ alias kernel = block_sum_dot_product[in_layout, out_layout, TPB]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
@@ -340,8 +330,8 @@ def main():
print("...")
print()
- input_tensor = LayoutTensor[mut=False, dtype, in_layout](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](
+ input_buf
)
# Demonstrate histogram for each bin using block.prefix_sum()
@@ -364,19 +354,18 @@ def main():
1
).enqueue_fill(0)
- bin_tensor = LayoutTensor[mut=True, dtype, bin_layout](
- bin_data.unsafe_ptr()
- )
- count_tensor = LayoutTensor[mut=True, DType.int32, out_layout](
- bin_count.unsafe_ptr()
+ bin_tensor = LayoutTensor[dtype, bin_layout, MutAnyOrigin](
+ bin_data
)
+ count_tensor = LayoutTensor[
+ DType.int32, out_layout, MutAnyOrigin
+ ](bin_count)
# Execute histogram kernel for this specific bin
- ctx.enqueue_function[
- block_histogram_bin_extract[
- in_layout, bin_layout, out_layout, TPB
- ]
- ](
+ alias kernel = block_histogram_bin_extract[
+ in_layout, bin_layout, out_layout, TPB
+ ]
+ ctx.enqueue_function_checked[kernel, kernel](
input_tensor,
bin_tensor,
count_tensor,
@@ -439,17 +428,16 @@ def main():
print("Mean value:", mean_value)
print()
- input_tensor = LayoutTensor[mut=False, dtype, in_layout](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, in_layout, ImmutAnyOrigin](
+ input_buf
)
- output_tensor = LayoutTensor[mut=True, dtype, vector_layout](
- output_buf.unsafe_ptr()
+ output_tensor = LayoutTensor[dtype, vector_layout, MutAnyOrigin](
+ output_buf
)
# Execute vector normalization kernel
- ctx.enqueue_function[
- block_normalize_vector[in_layout, vector_layout, TPB]
- ](
+ alias kernel = block_normalize_vector[in_layout, vector_layout, TPB]
+ ctx.enqueue_function_checked[kernel, kernel](
input_tensor,
output_tensor,
SIZE,
diff --git a/solutions/p28/p28.mojo b/solutions/p28/p28.mojo
index 550e07e7..63f676a6 100644
--- a/solutions/p28/p28.mojo
+++ b/solutions/p28/p28.mojo
@@ -24,9 +24,9 @@ alias layout_async = Layout.row_major(VECTOR_SIZE)
fn async_copy_overlap_convolution[
dtype: DType, layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
- kernel: LayoutTensor[mut=False, dtype, Layout.row_major(KERNEL_SIZE)],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
+ kernel: LayoutTensor[dtype, Layout.row_major(KERNEL_SIZE), ImmutAnyOrigin],
):
"""Demonstrates async copy operations building on p14 patterns.
@@ -38,13 +38,13 @@ fn async_copy_overlap_convolution[
input_shared = LayoutTensor[
dtype,
Layout.row_major(CONV_TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
kernel_shared = LayoutTensor[
dtype,
Layout.row_major(KERNEL_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -111,19 +111,18 @@ def test_async_copy_overlap_convolution():
for i in range(KERNEL_SIZE):
kernel_host[i] = Float32(i + 1)
- input_tensor = LayoutTensor[mut=False, dtype, layout_async](
- input_buf.unsafe_ptr()
+ input_tensor = LayoutTensor[dtype, layout_async, ImmutAnyOrigin](
+ input_buf
)
- output_tensor = LayoutTensor[mut=True, dtype, layout_async](
- output_buf.unsafe_ptr()
+ output_tensor = LayoutTensor[dtype, layout_async, MutAnyOrigin](
+ output_buf
)
kernel_tensor = LayoutTensor[
mut=False, dtype, Layout.row_major(KERNEL_SIZE)
- ](kernel_buf.unsafe_ptr())
+ ](kernel_buf)
- ctx.enqueue_function[
- async_copy_overlap_convolution[dtype, layout_async]
- ](
+ alias kernel = async_copy_overlap_convolution[dtype, layout_async]
+ ctx.enqueue_function_checked[kernel, kernel](
output_tensor,
input_tensor,
kernel_tensor,
diff --git a/solutions/p29/p29.mojo b/solutions/p29/p29.mojo
index 14073287..333b2423 100644
--- a/solutions/p29/p29.mojo
+++ b/solutions/p29/p29.mojo
@@ -28,8 +28,8 @@ alias BLUR_RADIUS = 2
fn multi_stage_image_blur_pipeline[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
"""Multi-stage image blur pipeline with barrier coordination.
@@ -43,13 +43,13 @@ fn multi_stage_image_blur_pipeline[
input_shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
blur_shared = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -136,8 +136,8 @@ alias BUFFER_COUNT = 2
fn double_buffered_stencil_computation[
layout: Layout
](
- output: LayoutTensor[mut=True, dtype, layout],
- input: LayoutTensor[mut=False, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ input: LayoutTensor[dtype, layout, ImmutAnyOrigin],
size: Int,
):
"""Double-buffered stencil computation with memory barrier coordination.
@@ -150,13 +150,13 @@ fn double_buffered_stencil_computation[
buffer_A = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
buffer_B = LayoutTensor[
dtype,
Layout.row_major(TPB),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -164,19 +164,19 @@ fn double_buffered_stencil_computation[
init_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
iter_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
final_barrier = LayoutTensor[
DType.uint64,
Layout.row_major(1),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -284,10 +284,11 @@ def test_multi_stage_pipeline():
inp_host[i] = Float32(i % 10) + Float32(i / 100.0)
# Create LayoutTensors
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- inp_tensor = LayoutTensor[mut=False, dtype, layout](inp.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ inp_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp)
- ctx.enqueue_function[multi_stage_image_blur_pipeline[layout]](
+ alias kernel = multi_stage_image_blur_pipeline[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
inp_tensor,
SIZE,
@@ -343,10 +344,11 @@ def test_double_buffered_stencil():
inp_host[i] = Float32(1.0 if i % 20 < 10 else 0.0)
# Create LayoutTensors for Puzzle 26B
- out_tensor = LayoutTensor[mut=True, dtype, layout](out.unsafe_ptr())
- inp_tensor = LayoutTensor[mut=False, dtype, layout](inp.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out)
+ inp_tensor = LayoutTensor[dtype, layout, ImmutAnyOrigin](inp)
- ctx.enqueue_function[double_buffered_stencil_computation[layout]](
+ alias kernel = double_buffered_stencil_computation[layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
inp_tensor,
SIZE,
diff --git a/solutions/p33/p33.mojo b/solutions/p33/p33.mojo
index 70d3097d..9950e686 100644
--- a/solutions/p33/p33.mojo
+++ b/solutions/p33/p33.mojo
@@ -43,13 +43,13 @@ fn matmul_idiomatic_tiled[
a_shared = LayoutTensor[
dtype,
Layout.row_major(TILE_SIZE, TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
b_shared = LayoutTensor[
dtype,
Layout.row_major(TILE_SIZE, TILE_SIZE),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -158,13 +158,13 @@ fn tensor_core_matrix_multiplication[
A_sram_tile = LayoutTensor[
A.dtype,
Layout.row_major(BM, BK),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
B_sram_tile = LayoutTensor[
B.dtype,
Layout.row_major(BK, BN),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -172,7 +172,7 @@ fn tensor_core_matrix_multiplication[
C_warp_accum = LayoutTensor[
C.dtype,
Layout.row_major(WM, WN),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.LOCAL,
].stack_allocation()
diff --git a/solutions/p34/p34.mojo b/solutions/p34/p34.mojo
index 82ba485b..b4e4ee72 100644
--- a/solutions/p34/p34.mojo
+++ b/solutions/p34/p34.mojo
@@ -39,7 +39,7 @@ fn cluster_coordination_basics[
shared_data = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
@@ -99,7 +99,7 @@ fn cluster_collective_operations[
shared_mem = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()
shared_mem[local_i] = my_value
@@ -149,7 +149,7 @@ fn advanced_cluster_patterns[
shared_data = LayoutTensor[
dtype,
Layout.row_major(tpb),
- MutableAnyOrigin,
+ MutAnyOrigin,
address_space = AddressSpace.SHARED,
].stack_allocation()