6 changes: 3 additions & 3 deletions book/src/puzzle_04/intro.mojo
@@ -6,7 +6,7 @@ alias WIDTH = 3
alias dtype = DType.float32
alias layout = Layout.row_major(HEIGHT, WIDTH)

- fn kernel[dtype: DType, layout: Layout](tensor: LayoutTensor[mut=True, dtype, layout]):
Review comment (Collaborator): This change isn't required afaik and the existing pattern is more succinct than using the explicit MutableAnyOrigin. Wdyt?
+ fn kernel[dtype: DType, layout: Layout](tensor: LayoutTensor[dtype, layout, MutAnyOrigin]):
print("Before:")
print(tensor)
tensor[0, 0] += 1
@@ -17,8 +17,8 @@ def main():
ctx = DeviceContext()

a = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)
- tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())
+ tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a)
# Note: since `tensor` is a device tensor we can't print it without the kernel wrapper
- ctx.enqueue_function[kernel[dtype, layout]](tensor, grid_dim=1, block_dim=1)
+ ctx.enqueue_function_checked[kernel[dtype, layout], kernel[dtype, layout]](tensor, grid_dim=1, block_dim=1)

ctx.synchronize()
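
For reference, here is the whole migrated pattern from this file assembled in one place. This is a sketch, not the file verbatim: the import lines and the `HEIGHT` value sit outside the hunks above, so they are assumptions.

```mojo
# Sketch assembled from the hunks above. Import paths and HEIGHT are
# assumptions; everything else is taken from the diff itself.
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor

alias HEIGHT = 2  # illustrative; the real value is outside the hunk
alias WIDTH = 3
alias dtype = DType.float32
alias layout = Layout.row_major(HEIGHT, WIDTH)

fn kernel[dtype: DType, layout: Layout](tensor: LayoutTensor[dtype, layout, MutAnyOrigin]):
    print("Before:")
    print(tensor)
    tensor[0, 0] += 1

def main():
    ctx = DeviceContext()
    # The buffer itself is passed to LayoutTensor now; no unsafe_ptr() needed.
    a = ctx.enqueue_create_buffer[dtype](HEIGHT * WIDTH).enqueue_fill(0)
    tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a)
    # Checked launch: the kernel type appears twice so argument types are
    # verified against the kernel signature at enqueue time.
    ctx.enqueue_function_checked[kernel[dtype, layout], kernel[dtype, layout]](tensor, grid_dim=1, block_dim=1)
    ctx.synchronize()
```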
4 changes: 2 additions & 2 deletions book/src/puzzle_08/layout_tensor.md
@@ -30,7 +30,7 @@ The key insight is how LayoutTensor simplifies shared memory management while ma
shared = stack_allocation[TPB, Scalar[dtype]]()

# LayoutTensor approach
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

2. **Memory access**: Same syntax
@@ -168,7 +168,7 @@ This solution demonstrates how LayoutTensor simplifies shared memory usage while

```txt
# Clean LayoutTensor API with address_space
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

- Natural indexing for both global and shared:
4 changes: 2 additions & 2 deletions book/src/puzzle_11/layout_tensor.md
@@ -24,7 +24,7 @@ The key insight is how LayoutTensor simplifies shared memory management while ma

Notes:

- - **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+ - **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Window access**: Natural indexing for 3-element windows
- **Edge handling**: Special cases for first two positions
- **Memory pattern**: One shared memory load per thread
@@ -116,7 +116,7 @@ The solution implements a sliding window sum using LayoutTensor with these key s
- LayoutTensor creates block-local storage with address_space:

```txt
- shared = LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

- Each thread loads one element:
2 changes: 1 addition & 1 deletion book/src/puzzle_12/layout_tensor.md
@@ -25,7 +25,7 @@ The key insight is how LayoutTensor simplifies memory management while maintaini

Notes:

- - **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+ - **LayoutTensor allocation**: Use `LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Element access**: Natural indexing with bounds checking
- **Layout handling**: Separate layouts for input and output
- **Thread coordination**: Same synchronization patterns with `barrier()`
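
As a minimal sketch of how those notes fit together inside a kernel body (the index arithmetic and the `a`/`SIZE` names are illustrative, not the puzzle's exact code):

```mojo
# Sketch: allocation, bounds-checked access, and barrier() coordination
# from the notes above. `a` and `SIZE` are assumed names.
shared = LayoutTensor[dtype, Layout.row_major(TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()

global_i = block_dim.x * block_idx.x + thread_idx.x
local_i = thread_idx.x
if global_i < SIZE:  # bounds check before the element access
    shared[local_i] = a[global_i]
barrier()  # all threads see the fully populated tile after this point
```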
6 changes: 3 additions & 3 deletions book/src/puzzle_13/block_boundary.md
@@ -32,7 +32,7 @@ Notes:

<div class="solution-tips">

- 1. Use `LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` for shared memory
+ 1. Use `LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` for shared memory
2. Load main data: `shared_a[local_i] = a[global_i]`
3. Load boundary: `if local_i < CONV_2 - 1` handle next block data
4. Load kernel: `shared_b[local_i] = b[local_i]`
@@ -125,8 +125,8 @@ Size calculation:

```mojo
# First: account for padding needed for convolution window
- shared_a = LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
- shared_b = LayoutTensor[dtype, Layout.row_major(CONV_2), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_a = LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_b = LayoutTensor[dtype, Layout.row_major(CONV_2), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

This allocation pattern ensures we have enough space for both the block's data and the overlap region.
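
Putting hints 1-4 together, the load phase looks roughly like this. It is a sketch only: `SIZE_2` and the index names are assumed from the puzzle's setup, and the convolution loop that follows the `barrier()` is omitted.

```mojo
# Sketch of the load phase from hints 1-4 (not the full solution).
shared_a = LayoutTensor[dtype, Layout.row_major(TPB + CONV_2 - 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
shared_b = LayoutTensor[dtype, Layout.row_major(CONV_2), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()

if global_i < SIZE_2:
    shared_a[local_i] = a[global_i]  # hint 2: main element for this thread
if local_i < CONV_2 - 1 and global_i + TPB < SIZE_2:
    shared_a[local_i + TPB] = a[global_i + TPB]  # hint 3: overlap from the next block
if local_i < CONV_2:
    shared_b[local_i] = b[local_i]  # hint 4: the convolution kernel
barrier()
```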
4 changes: 2 additions & 2 deletions book/src/puzzle_14/complete.md
@@ -343,10 +343,10 @@ The two kernel phases execute sequentially **without any explicit synchronizatio

```mojo
# Phase 1: Local prefix sums
- ctx.enqueue_function[prefix_sum_local_phase[...]](...)
+ ctx.enqueue_function_checked[prefix_sum_local_phase[...], prefix_sum_local_phase[...]](...)

# Phase 2: Add block sums (automatically waits for Phase 1)
- ctx.enqueue_function[prefix_sum_block_sum_phase[...]](...)
+ ctx.enqueue_function_checked[prefix_sum_block_sum_phase[...], prefix_sum_block_sum_phase[...]](...)
```

**Key insight**: Mojo's `DeviceContext` uses a single execution stream (CUDA stream on NVIDIA GPUs, HIP stream on AMD ROCm GPUs), which guarantees that kernel launches execute in the exact order they are enqueued. No explicit synchronization is needed between kernels.
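
Since the checked launch spells the kernel type twice, the `alias` trick introduced in `p05_layout_tensor.mojo` below keeps these calls readable. A sketch with the same `[...]`/`(...)` elisions as the snippet above:

```mojo
# Sketch: stream order alone sequences the two phases; only the host waits.
alias phase_1 = prefix_sum_local_phase[...]
alias phase_2 = prefix_sum_block_sum_phase[...]

ctx.enqueue_function_checked[phase_1, phase_1](...)  # enqueued first
ctx.enqueue_function_checked[phase_2, phase_2](...)  # starts only after phase 1 completes
ctx.synchronize()  # host-side wait; no device-side sync between kernels
```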
4 changes: 2 additions & 2 deletions book/src/puzzle_16/shared_memory.md
@@ -131,8 +131,8 @@ Matrix B: b_shared: (similar layout)

```mojo
# Create 2D shared memory tensors using LayoutTensor with address_space
- a_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
- b_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ a_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ b_shared = LayoutTensor[dtype, Layout.row_major(TPB, TPB), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

2. **Thread Indexing**:
2 changes: 1 addition & 1 deletion book/src/puzzle_17/puzzle_17.md
@@ -181,7 +181,7 @@ Let's break down how this works in the larger context:
```mojo
gpu_ctx = ctx.get_device_context()
gpu_ctx.enqueue_memset(...) # Zero output buffer
- gpu_ctx.enqueue_function[...](...) # Schedule kernel
+ gpu_ctx.enqueue_function_checked[..., ...](...) # Schedule kernel
```

- Device context manages GPU resources
4 changes: 2 additions & 2 deletions book/src/puzzle_18/puzzle_18.md
@@ -265,8 +265,8 @@ The kernel is parameterized with:
#### Shared memory allocation

```mojo
- shared_max = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
- shared_sum = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_max = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
+ shared_sum = LayoutTensor[dtype, Layout.row_major(BLOCK_DIM_X), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()
```

The kernel allocates two shared memory buffers:
2 changes: 1 addition & 1 deletion book/src/puzzle_19/puzzle_19.md
@@ -121,7 +121,7 @@ To complete this puzzle, we'll leverage the tiled matmul kernel from [Puzzle 16]

**Transpose Kernel Implementation Guide:**

- 1. **Shared Memory Setup**: Use `LayoutTensor[dtype, Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` to create a square `TRANSPOSE_BLOCK_DIM_XY` × `TRANSPOSE_BLOCK_DIM_XY` shared memory tile for efficient data exchange between threads
+ 1. **Shared Memory Setup**: Use `LayoutTensor[dtype, Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` to create a square `TRANSPOSE_BLOCK_DIM_XY` × `TRANSPOSE_BLOCK_DIM_XY` shared memory tile for efficient data exchange between threads

2. **Thread Indexing**: Map threads to matrix elements:
- `local_row = thread_idx.y`, `local_col = thread_idx.x` (position within the block)
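
One standard shape for such a kernel is sketched below. It follows steps 1 and 2 but is illustrative rather than the puzzle's exact solution: the layout parameters, `rows`/`cols` bounds, and `ImmutAnyOrigin` on the input are assumptions.

```mojo
fn transpose_kernel[
    in_layout: Layout, out_layout: Layout
](
    output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
    inp: LayoutTensor[dtype, in_layout, ImmutAnyOrigin],
    rows: Int,
    cols: Int,
):
    # Step 1: square shared-memory tile.
    tile = LayoutTensor[dtype, Layout.row_major(TRANSPOSE_BLOCK_DIM_XY, TRANSPOSE_BLOCK_DIM_XY), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()

    # Step 2: thread indexing.
    local_row = thread_idx.y
    local_col = thread_idx.x
    in_row = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_row
    in_col = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_col

    if in_row < rows and in_col < cols:
        tile[local_row, local_col] = inp[in_row, in_col]
    barrier()

    # Write back with block coordinates swapped, reading the tile transposed.
    out_row = block_idx.x * TRANSPOSE_BLOCK_DIM_XY + local_row
    out_col = block_idx.y * TRANSPOSE_BLOCK_DIM_XY + local_col
    if out_row < cols and out_col < rows:
        output[out_row, out_col] = tile[local_col, local_row]
```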
2 changes: 1 addition & 1 deletion book/src/puzzle_32/conflict_free_patterns.md
@@ -354,7 +354,7 @@ constant = shared[0] # All threads read same address - hardware optimized
**3. Padding techniques:**

```mojo
- shared = LayoutTensor[dtype, Layout.row_major(TPB + 1), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation() # Shift access patterns
+ shared = LayoutTensor[dtype, Layout.row_major(TPB + 1), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation() # Shift access patterns
```

**4. Access pattern analysis:**
4 changes: 2 additions & 2 deletions book/src/puzzle_34/cluster_coordination_basics.md
@@ -65,7 +65,7 @@ Traditional single-block algorithms like those in [Puzzle 27](../puzzle_27/puzzl

### **Shared memory coordination**

- - Allocate shared memory using `LayoutTensor[dtype, Layout.row_major(tpb), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` (see [shared memory basics from Puzzle 8](../puzzle_08/puzzle_08.md))
+ - Allocate shared memory using `LayoutTensor[dtype, Layout.row_major(tpb), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()` (see [shared memory basics from Puzzle 8](../puzzle_08/puzzle_08.md))
- Process input data scaled by `block_id + 1` to create distinct scaling per block
- Use bounds checking when accessing input data (pattern from [guards in Puzzle 3](../puzzle_03/puzzle_03.md))

@@ -153,7 +153,7 @@ block_id = Int(block_idx.x) # Block index for reliable

**Shared memory allocation and data processing:**

- - Each block allocates its own shared memory workspace: `LayoutTensor[dtype, Layout.row_major(tpb), MutableAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
+ - Each block allocates its own shared memory workspace: `LayoutTensor[dtype, Layout.row_major(tpb), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()`
- **Scaling strategy**: `data_scale = Float32(block_id + 1)` ensures each block processes data differently
- Block 0: multiplies by 1.0, Block 1: by 2.0, Block 2: by 3.0, Block 3: by 4.0
- **Bounds checking**: `if global_i < size:` prevents out-of-bounds memory access
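
In kernel form, that setup is roughly the fragment below (a sketch: `data`, `local_i`, and `global_i` are illustrative names, and the cluster coordination that follows is omitted):

```mojo
# Sketch of the per-block workspace, scaling, and guard described above.
shared = LayoutTensor[dtype, Layout.row_major(tpb), MutAnyOrigin, address_space = AddressSpace.SHARED].stack_allocation()

block_id = Int(block_idx.x)
data_scale = Float32(block_id + 1)  # block 0 -> 1.0, block 1 -> 2.0, ...

if global_i < size:  # bounds check prevents out-of-bounds access
    shared[local_i] = data[global_i] * data_scale
barrier()
```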
6 changes: 3 additions & 3 deletions problems/p01/p01.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i

- ctx.enqueue_function[add_10](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10, add_10](
+ out,
+ a,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
8 changes: 4 additions & 4 deletions problems/p02/p02.mojo
@@ -34,10 +34,10 @@ def main():
b_host[i] = i
expected[i] = a_host[i] + b_host[i]

- ctx.enqueue_function[add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[add, add](
+ out,
+ a,
+ b,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
)
6 changes: 3 additions & 3 deletions problems/p03/p03.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i

- ctx.enqueue_function[add_10_guard](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_guard, add_10_guard](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
6 changes: 3 additions & 3 deletions problems/p04/p04.mojo
@@ -37,9 +37,9 @@ def main():
a_host[i * SIZE + j] = i * SIZE + j
expected[i * SIZE + j] = a_host[i * SIZE + j] + 10

- ctx.enqueue_function[add_10_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
10 changes: 5 additions & 5 deletions problems/p04/p04_layout_tensor.mojo
@@ -12,8 +12,8 @@ alias layout = Layout.row_major(SIZE, SIZE)


fn add_10_2d(
- output: LayoutTensor[mut=True, dtype, layout],
- a: LayoutTensor[mut=True, dtype, layout],
+ output: LayoutTensor[dtype, layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, layout, MutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -27,7 +27,7 @@ fn add_10_2d(
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, layout](out_buf.unsafe_ptr())
+ out_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](out_buf)
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())

expected = ctx.enqueue_create_host_buffer[dtype](
@@ -40,9 +40,9 @@ def main():
a_host[i] = i
expected[i] = a_host[i] + 10

- a_tensor = LayoutTensor[mut=True, dtype, layout](a.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, layout, MutAnyOrigin](a)

- ctx.enqueue_function[add_10_2d](
+ ctx.enqueue_function_checked[add_10_2d, add_10_2d](
out_tensor,
a_tensor,
SIZE,
8 changes: 4 additions & 4 deletions problems/p05/p05.mojo
@@ -39,10 +39,10 @@ def main():
for j in range(SIZE):
expected[i * SIZE + j] = a_host[j] + b_host[i]

- ctx.enqueue_function[broadcast_add](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
- b.unsafe_ptr(),
+ ctx.enqueue_function_checked[broadcast_add, broadcast_add](
+ out,
+ a,
+ b,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
21 changes: 10 additions & 11 deletions problems/p05/p05_layout_tensor.mojo
@@ -18,9 +18,9 @@ fn broadcast_add[
a_layout: Layout,
b_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
- b: LayoutTensor[mut=False, dtype, b_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
+ b: LayoutTensor[dtype, b_layout, ImmutAnyOrigin],
size: Int,
):
row = thread_idx.y
@@ -32,16 +32,14 @@ def main():
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[mut=True, dtype, out_layout](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)
print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())

expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
).enqueue_fill(0)
- expected_tensor = LayoutTensor[mut=True, dtype, out_layout](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)

a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
@@ -55,10 +53,11 @@ def main():
for j in range(SIZE):
expected_tensor[i, j] = a_host[j] + b_host[i]

- a_tensor = LayoutTensor[dtype, a_layout](a.unsafe_ptr())
- b_tensor = LayoutTensor[dtype, b_layout](b.unsafe_ptr())
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)
+ b_tensor = LayoutTensor[dtype, b_layout, ImmutAnyOrigin](b)

- ctx.enqueue_function[broadcast_add[out_layout, a_layout, b_layout]](
+ alias kernel = broadcast_add[out_layout, a_layout, b_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
b_tensor,
6 changes: 3 additions & 3 deletions problems/p06/p06.mojo
@@ -30,9 +30,9 @@ def main():
for i in range(SIZE):
a_host[i] = i

- ctx.enqueue_function[add_10_blocks](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks, add_10_blocks](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
6 changes: 3 additions & 3 deletions problems/p07/p07.mojo
@@ -38,9 +38,9 @@ def main():
a_host[k] = k
expected[k] = k + 10

- ctx.enqueue_function[add_10_blocks_2d](
- out.unsafe_ptr(),
- a.unsafe_ptr(),
+ ctx.enqueue_function_checked[add_10_blocks_2d, add_10_blocks_2d](
+ out,
+ a,
SIZE,
grid_dim=BLOCKS_PER_GRID,
block_dim=THREADS_PER_BLOCK,
23 changes: 9 additions & 14 deletions problems/p07/p07_layout_tensor.mojo
@@ -16,8 +16,8 @@ fn add_10_blocks_2d[
out_layout: Layout,
a_layout: Layout,
](
- output: LayoutTensor[mut=True, dtype, out_layout],
- a: LayoutTensor[mut=False, dtype, a_layout],
+ output: LayoutTensor[dtype, out_layout, MutAnyOrigin],
+ a: LayoutTensor[dtype, a_layout, ImmutAnyOrigin],
size: Int,
):
row = block_dim.y * block_idx.y + thread_idx.y
@@ -31,9 +31,7 @@ def main():
def main():
with DeviceContext() as ctx:
out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
- out_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf.unsafe_ptr()
- )
+ out_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf)

expected_buf = ctx.enqueue_create_host_buffer[dtype](
SIZE * SIZE
@@ -48,11 +46,10 @@ def main():
a_host[k] = k
expected_buf[k] = k + 10

- a_tensor = LayoutTensor[dtype, a_layout, MutableAnyOrigin](
- a.unsafe_ptr()
- )
+ a_tensor = LayoutTensor[dtype, a_layout, ImmutAnyOrigin](a)

- ctx.enqueue_function[add_10_blocks_2d[out_layout, a_layout]](
+ alias kernel = add_10_blocks_2d[out_layout, a_layout]
+ ctx.enqueue_function_checked[kernel, kernel](
out_tensor,
a_tensor,
SIZE,
@@ -62,16 +59,14 @@ def main():

ctx.synchronize()

- expected_tensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- expected_buf.unsafe_ptr()
+ expected_tensor = LayoutTensor[dtype, out_layout, MutAnyOrigin](
+ expected_buf
)

with out_buf.map_to_host() as out_buf_host:
print(
"out:",
- LayoutTensor[dtype, out_layout, MutableAnyOrigin](
- out_buf_host.unsafe_ptr()
- ),
+ LayoutTensor[dtype, out_layout, MutAnyOrigin](out_buf_host),
)
print("expected:", expected_tensor)
for i in range(SIZE):